From 54c978b022b2445669243509b230859304565598 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 17 Apr 2018 07:52:30 +0200 Subject: [PATCH 001/304] Initial CMake files, testing, benchmarking and example infrastructure --- .gitignore | 39 +++++ CMakeLists.txt | 164 +++++++++++++++++++++ LICENSE.txt | 7 + benchmark/CMakeLists.txt | 26 ++++ cmake/DownloadProject.CMakeLists.cmake.in | 27 ++++ cmake/DownloadProject.cmake | 170 ++++++++++++++++++++++ example/CMakeLists.txt | 25 ++++ example/rocsparse_handle.cpp | 4 + library/CMakeLists.txt | 60 ++++++++ library/include/rocsparse_version.h.in | 0 library/src/CMakeLists.txt | 4 + library/src/context.cpp | 0 library/src/include/context.h | 0 test/CMakeLists.txt | 33 +++++ test/test_device_apis.cpp | 0 test/test_rocsparse_handle.cpp | 0 16 files changed, 559 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 LICENSE.txt create mode 100644 benchmark/CMakeLists.txt create mode 100644 cmake/DownloadProject.CMakeLists.cmake.in create mode 100644 cmake/DownloadProject.cmake create mode 100644 example/CMakeLists.txt create mode 100644 example/rocsparse_handle.cpp create mode 100644 library/CMakeLists.txt create mode 100644 library/include/rocsparse_version.h.in create mode 100644 library/src/CMakeLists.txt create mode 100644 library/src/context.cpp create mode 100644 library/src/include/context.h create mode 100644 test/CMakeLists.txt create mode 100644 test/test_device_apis.cpp create mode 100644 test/test_rocsparse_handle.cpp diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..f9bbf717 --- /dev/null +++ b/.gitignore @@ -0,0 +1,39 @@ +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +# vim tags +tags +.tags +.*.swp + +# Editors +.vscode + +# 
build-in-source directory +build diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..37058b6d --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,164 @@ +cmake_minimum_required(VERSION 3.5 FATAL_ERROR) + +# CMake modules +list(APPEND CMAKE_MODULE_PATH + ${CMAKE_CURRENT_SOURCE_DIR}/cmake + /opt/rocm/hip/cmake +) + +# Find HIP package +find_package(HIP REQUIRED) + +# Set compiler +if(HIP_PLATFORM STREQUAL "nvcc") + message("-- PLATFORM = nvcc currently unsupported") +elseif(HIP_PLATFORM STREQUAL "hcc") + message("-- PLATFORM = hcc") + find_program(HIP_HCC_EXECUTABLE NAMES hcc PATHS + "${HIP_ROOT_DIR}" + ENV ROCM_PATH + ENV HIP_PATH + /opt/rocm + /opt/rocm/hip + PATH_SUFFIXES bin + NO_DEFAULT_PATH + ) + if(NOT HIP_HCC_EXECUTABLE) + find_program(HIP_HCC_EXECUTABLE hcc) + endif() + mark_as_advanced(HIP_HCC_EXECUTABLE) + set(CMAKE_CXX_COMPILER ${HIP_HCC_EXECUTABLE}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument") + list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hcc /opt/rocm/hip) +else() + message(FATAL_ERROR "HIP_PLATFORM must be 'hcc/nvcc' (AMD ROCm platform).") +endif() + +# rocSPARSE project +project(rocsparse VERSION 0.1.0.0 LANGUAGES CXX) + +# Set a default build type if none was specified +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "Setting build type to 'Release' as none was specified.") + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + +# CXX Build flags +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") + +# HIP configuration +if(HIP_PLATFORM STREQUAL "hcc") + # Workaround until hcc & hip cmake modules fixes symlink logic in their config files. + # (Thanks to rocBLAS devs for finding workaround for this problem!) 
+ list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hcc /opt/rocm/hip) + # Ignore hcc warning: argument unused during compilation: '-isystem /opt/rocm/hip/include' + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument") + find_package(hcc REQUIRED CONFIG PATHS /opt/rocm) + find_package(hip REQUIRED CONFIG PATHS /opt/rocm) +endif() + +# Build options +option(BUILD_SHARED_LIBS "Build rocSPARSE as a shared library" ON) +option(BUILD_VERBOSE "Output additional build information" ON) +option(BUILD_TEST "Build tests (requires googletest)" ON) +option(BUILD_BENCHMARK "Build benchmarks (requires googlebenchmark)" ON) +option(BUILD_EXAMPLE "Build examples" ON) + +# Test dependencies +if(BUILD_TEST) + if(NOT DEPENDENCIES_FORCE_DOWNLOAD) + find_package(GTest QUIET) + endif() + if(NOT GTEST_FOUND) + message(STATUS "GTest not found. Downloading and building GTest.") + include(cmake/DownloadProject.cmake) + find_package(Git REQUIRED) + set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gtest CACHE PATH "") + download_project(PROJ googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG master + INSTALL_DIR ${GTEST_ROOT} + CMAKE_ARGS -DBUILD_GTEST=ON -DINSTALL_GTEST=ON -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX= + LOG_DOWNLOAD TRUE + LOG_CONFIGURE TRUE + LOG_BUILD TRUE + LOG_INSTALL TRUE + BUILD_PROJECT TRUE + UPDATE_DISCONNECTED TRUE + ) + endif() + find_package(GTest REQUIRED) +endif() + +# Benchmark dependencies +if(BUILD_BENCHMARK) + if(NOT DEPENDENCIES_FORCE_DOWNLOAD) + find_package(benchmark QUIET) + endif() + if(NOT benchmark_FOUND) + message(STATUS "Google Benchmark not found. 
Downloading and building Google Benchmark.") + include(cmake/DownloadProject.cmake) + find_package(Git REQUIRED) + set(GOOGLEBENCHMARK_ROOT ${CMAKE_CURRENT_BINARY_DIR}/googlebenchmark CACHE PATH "") + download_project(PROJ googlebenchmark + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG master + INSTALL_DIR ${GOOGLEBENCHMARK_ROOT} + CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBENCHMARK_ENABLE_TESTING=OFF -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX= + LOG_DOWNLOAD TRUE + LOG_CONFIGURE TRUE + LOG_BUILD TRUE + LOG_INSTALL TRUE + BUILD_PROJECT TRUE + UPDATE_DISCONNECTED TRUE + ) + endif() + find_package(benchmark REQUIRED CONFIG PATHS ${GOOGLEBENCHMARK_ROOT}) +endif() + +# ROCm cmake project +find_package(ROCM QUIET CONFIG PATHS /opt/rocm) +if(NOT ROCM_FOUND) + set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download") + file( + DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip + ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}.zip + ) + execute_process( + COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}.zip + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + find_package(ROCM REQUIRED CONFIG PATHS ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}) +endif() + +include(ROCMSetupVersion) +include(ROCMCreatePackage) +include(ROCMInstallTargets) +include(ROCMPackageConfigHelpers) +include(ROCMInstallSymlinks) + +# AMD targets +set(AMDGPU_TARGETS gfx803;gfx900 CACHE STRING "List of specific machine types for library to target") + +# rocSPARSE library +add_subdirectory(library) + +# Tests +if(BUILD_TEST) + enable_testing() + add_subdirectory(test) +endif() + +# Benchmarks +if(BUILD_BENCHMARK) + add_subdirectory(benchmark) +endif() + +# Examples +if(BUILD_EXAMPLE) + add_subdirectory(example) +endif() diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000..e0f8156e --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,7 @@ +Copyright © 2016 
Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 00000000..f37f7465 --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,26 @@ +# Function to add rocsparse benchmarks +function(add_rocsparse_benchmark BENCHMARK_SOURCE) + get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE) + add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE}) + if(HIP_PLATFORM STREQUAL "hcc") + target_link_libraries(${BENCHMARK_TARGET} + PRIVATE + rocsparse + hip::hip_hcc + benchmark::benchmark + ) + foreach(amdgpu_target ${AMDGPU_TARGETS}) + target_link_libraries(${BENCHMARK_TARGET} + PRIVATE + --amdgpu-target=${amdgpu_target} + ) + endforeach() + endif() + set_target_properties(${BENCHMARK_TARGET} + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmark" + ) +endfunction() + +# Benchmarks +#add_rocsparse_benchmark(benchmark_spmv.cpp) diff --git a/cmake/DownloadProject.CMakeLists.cmake.in b/cmake/DownloadProject.CMakeLists.cmake.in new file mode 100644 index 00000000..5546c03a --- /dev/null +++ b/cmake/DownloadProject.CMakeLists.cmake.in @@ -0,0 +1,27 @@ +# Distributed under the OSI-approved MIT License. See accompanying +# file LICENSE or https://github.com/Crascit/DownloadProject for details. 
+ +cmake_minimum_required(VERSION 2.8.2) + +project(${DL_ARGS_PROJ}-download NONE) + +include(ExternalProject) +if(${DL_ARGS_BUILD_PROJECT}) + ExternalProject_Add(${DL_ARGS_PROJ}-download + ${DL_ARGS_UNPARSED_ARGUMENTS} + SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" + BUILD_IN_SOURCE TRUE + TEST_COMMAND "" + ) +else() + ExternalProject_Add(${DL_ARGS_PROJ}-download + ${DL_ARGS_UNPARSED_ARGUMENTS} + SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" + BUILD_IN_SOURCE TRUE + TEST_COMMAND "" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + ) +endif() diff --git a/cmake/DownloadProject.cmake b/cmake/DownloadProject.cmake new file mode 100644 index 00000000..54633d34 --- /dev/null +++ b/cmake/DownloadProject.cmake @@ -0,0 +1,170 @@ +# Distributed under the OSI-approved MIT License. See accompanying +# file LICENSE or https://github.com/Crascit/DownloadProject for details. +# +# MODULE: DownloadProject +# +# PROVIDES: +# download_project( PROJ projectName +# [PREFIX prefixDir] +# [DOWNLOAD_DIR downloadDir] +# [SOURCE_DIR srcDir] +# [BINARY_DIR binDir] +# [QUIET] +# ... +# ) +# +# Provides the ability to download and unpack a tarball, zip file, git repository, +# etc. at configure time (i.e. when the cmake command is run). How the downloaded +# and unpacked contents are used is up to the caller, but the motivating case is +# to download source code which can then be included directly in the build with +# add_subdirectory() after the call to download_project(). Source and build +# directories are set up with this in mind. +# +# The PROJ argument is required. The projectName value will be used to construct +# the following variables upon exit (obviously replace projectName with its actual +# value): +# +# projectName_SOURCE_DIR +# projectName_BINARY_DIR +# +# The SOURCE_DIR and BINARY_DIR arguments are optional and would not typically +# need to be provided. 
They can be specified if you want the downloaded source +# and build directories to be located in a specific place. The contents of +# projectName_SOURCE_DIR and projectName_BINARY_DIR will be populated with the +# locations used whether you provide SOURCE_DIR/BINARY_DIR or not. +# +# The DOWNLOAD_DIR argument does not normally need to be set. It controls the +# location of the temporary CMake build used to perform the download. +# +# The PREFIX argument can be provided to change the base location of the default +# values of DOWNLOAD_DIR, SOURCE_DIR and BINARY_DIR. If all of those three arguments +# are provided, then PREFIX will have no effect. The default value for PREFIX is +# CMAKE_BINARY_DIR. +# +# The QUIET option can be given if you do not want to show the output associated +# with downloading the specified project. +# +# In addition to the above, any other options are passed through unmodified to +# ExternalProject_Add() to perform the actual download, patch and update steps. +# +# Only those ExternalProject_Add() arguments which relate to downloading, patching +# and updating of the project sources are intended to be used. Also note that at +# least one set of download-related arguments are required. +# +# If using CMake 3.2 or later, the UPDATE_DISCONNECTED option can be used to +# prevent a check at the remote end for changes every time CMake is run +# after the first successful download. See the documentation of the ExternalProject +# module for more information. It is likely you will want to use this option if it +# is available to you. Note, however, that the ExternalProject implementation contains +# bugs which result in incorrect handling of the UPDATE_DISCONNECTED option when +# using the URL download method or when specifying a SOURCE_DIR with no download +# method. Fixes for these have been created, the last of which is scheduled for +# inclusion in CMake 3.8.0. 
Details can be found here: +# +# https://gitlab.kitware.com/cmake/cmake/commit/bdca68388bd57f8302d3c1d83d691034b7ffa70c +# https://gitlab.kitware.com/cmake/cmake/issues/16428 +# +# If you experience build errors related to the update step, consider avoiding +# the use of UPDATE_DISCONNECTED. +# +# EXAMPLE USAGE: +# +# include(DownloadProject) +# download_project(PROJ googletest +# GIT_REPOSITORY https://github.com/google/googletest.git +# GIT_TAG master +# UPDATE_DISCONNECTED 1 +# QUIET +# ) +# +# add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR}) +# +#======================================================================================== + + +set(_DownloadProjectDir "${CMAKE_CURRENT_LIST_DIR}") + +include(CMakeParseArguments) + +function(download_project) + + set(options QUIET) + set(oneValueArgs + PROJ + PREFIX + DOWNLOAD_DIR + SOURCE_DIR + BINARY_DIR + BUILD_PROJECT + ) + set(multiValueArgs "") + + cmake_parse_arguments(DL_ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + # Hide output if requested + if (DL_ARGS_QUIET) + set(OUTPUT_QUIET "OUTPUT_QUIET") + else() + unset(OUTPUT_QUIET) + message(STATUS "Downloading/updating ${DL_ARGS_PROJ}") + endif() + + # Set up where we will put our temporary CMakeLists.txt file and also + # the base point below which the default source and binary dirs will be. + # The prefix must always be an absolute path. 
+ if (NOT DL_ARGS_PREFIX) + set(DL_ARGS_PREFIX "${CMAKE_BINARY_DIR}") + else() + get_filename_component(DL_ARGS_PREFIX "${DL_ARGS_PREFIX}" ABSOLUTE + BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}") + endif() + if (NOT DL_ARGS_DOWNLOAD_DIR) + set(DL_ARGS_DOWNLOAD_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-download") + endif() + + # Ensure the caller can know where to find the source and build directories + if (NOT DL_ARGS_SOURCE_DIR) + set(DL_ARGS_SOURCE_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-src") + endif() + if (NOT DL_ARGS_BINARY_DIR) + set(DL_ARGS_BINARY_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-build") + endif() + set(${DL_ARGS_PROJ}_SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" PARENT_SCOPE) + set(${DL_ARGS_PROJ}_BINARY_DIR "${DL_ARGS_BINARY_DIR}" PARENT_SCOPE) + + # The way that CLion manages multiple configurations, it causes a copy of + # the CMakeCache.txt to be copied across due to it not expecting there to + # be a project within a project. This causes the hard-coded paths in the + # cache to be copied and builds to fail. To mitigate this, we simply + # remove the cache if it exists before we configure the new project. It + # is safe to do so because it will be re-generated. Since this is only + # executed at the configure step, it should not cause additional builds or + # downloads. + file(REMOVE "${DL_ARGS_DOWNLOAD_DIR}/CMakeCache.txt") + + # Create and build a separate CMake project to carry out the download. + # If we've already previously done these steps, they will not cause + # anything to be updated, so extra rebuilds of the project won't occur. + # Make sure to pass through CMAKE_MAKE_PROGRAM in case the main project + # has this set to something not findable on the PATH. + configure_file("${_DownloadProjectDir}/DownloadProject.CMakeLists.cmake.in" + "${DL_ARGS_DOWNLOAD_DIR}/CMakeLists.txt") + execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" + -D "CMAKE_MAKE_PROGRAM:FILE=${CMAKE_MAKE_PROGRAM}" + . 
+ RESULT_VARIABLE result + ${OUTPUT_QUIET} + WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}" + ) + if(result) + message(FATAL_ERROR "CMake step for ${DL_ARGS_PROJ} failed: ${result}") + endif() + execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + ${OUTPUT_QUIET} + WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}" + ) + if(result) + message(FATAL_ERROR "Build step for ${DL_ARGS_PROJ} failed: ${result}") + endif() +endfunction() diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt new file mode 100644 index 00000000..25fd2293 --- /dev/null +++ b/example/CMakeLists.txt @@ -0,0 +1,25 @@ +# Function to add rocsparse examples +function(add_rocsparse_example EXAMPLE_SOURCE) + get_filename_component(EXAMPLE_TARGET ${EXAMPLE_SOURCE} NAME_WE) + add_executable(${EXAMPLE_TARGET} ${EXAMPLE_SOURCE}) + if(HIP_PLATFORM STREQUAL "hcc") + target_link_libraries(${EXAMPLE_TARGET} + PRIVATE + rocsparse + hip::hip_hcc + ) + foreach(amdgpu_target ${AMDGPU_TARGETS}) + target_link_libraries(${EXAMPLE_TARGET} + PRIVATE + --amdgpu-target=${amdgpu_target} + ) + endforeach() + endif() + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/example" + ) +endfunction() + +# Examples +add_rocsparse_example(rocsparse_handle.cpp) diff --git a/example/rocsparse_handle.cpp b/example/rocsparse_handle.cpp new file mode 100644 index 00000000..bd9ec9d1 --- /dev/null +++ b/example/rocsparse_handle.cpp @@ -0,0 +1,4 @@ +int main(int argc, char *argv[]) { + + return 0; +} diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt new file mode 100644 index 00000000..eb968c18 --- /dev/null +++ b/library/CMakeLists.txt @@ -0,0 +1,60 @@ +# Verbose build info +if(BUILD_VERBOSE) + message(STATUS "rocsparse_VERSION: ${rocsparse_VERSION}") + message(STATUS "\t==>CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") + message(STATUS "\t==>BUILD_SHARED_LIBS: ${BUILD_SHARED_LIBS}") + message(STATUS "\t==>CMAKE_INSTALL_PREFIX link: " ${CMAKE_INSTALL_PREFIX}) + 
message(STATUS "\t==>CMAKE_MODULE_PATH link: " ${CMAKE_MODULE_PATH}) + message(STATUS "\t==>CMAKE_PREFIX_PATH link: " ${CMAKE_PREFIX_PATH}) + message(STATUS "==============") + message(STATUS "\t==>CMAKE_CXX_COMPILER: " ${CMAKE_CXX_FLAGS}) + message(STATUS "\t==>CMAKE_CXX_COMPILER debug: " ${CMAKE_CXX_FLAGS_DEBUG}) + message(STATUS "\t==>CMAKE_CXX_COMPILER release: " ${CMAKE_CXX_FLAGS_RELEASE}) + message(STATUS "\t==>CMAKE_CXX_COMPILER relwithdebinfo: " ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) + message(STATUS "\t==>CMAKE_EXE_LINKER_FLAGS: " ${CMAKE_EXE_LINKER_FLAGS}) + message(STATUS "\t==>CMAKE_EXE_LINKER_FLAGS_RELEASE: " ${CMAKE_EXE_LINKER_FLAGS_RELEASE}) + message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS: " ${CMAKE_SHARED_LINKER_FLAGS}) + message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_RELEASE: " ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}) + message(STATUS "==============") + message(STATUS "\t==>CMAKE_SHARED_LIBRARY_C_FLAGS: ${CMAKE_SHARED_LIBRARY_C_FLAGS}") + message(STATUS "\t==>CMAKE_SHARED_LIBRARY_CXX_FLAGS: ${CMAKE_SHARED_LIBRARY_CXX_FLAGS}") + message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS: ${CMAKE_SHARED_LINKER_FLAGS}") + message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_DEBUG: ${CMAKE_SHARED_LINKER_FLAGS_DEBUG}") + message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_RELEASE: ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}") +endif() + +# Configure a header file to pass the rocSPARSE version +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/include/rocsparse_version.h.in" + "${CMAKE_CURRENT_BINARY_DIR}/include/rocsparse_version.h" + @ONLY +) + +# Include sources +include(src/CMakeLists.txt) + +# Create library from sources +if(BUILD_SHARED_LIBS) + add_library(rocsparse SHARED ${rocsparse_source}) +else() + add_library(rocsparse STATIC ${rocsparse_source}) +endif() +add_library(roc::rocsparse ALIAS rocsparse) + +if(HIP_PLATFORM STREQUAL "hcc") + # Linker targets + target_link_libraries(rocsparse PRIVATE hip::hip_hcc hip::hip_device hcc::hccshared) + + # GPU targets + foreach(target 
${AMDGPU_TARGETS}) + target_link_libraries(rocsparse PRIVATE --amdgpu-target=${target}) + endforeach() +endif() + +# Include directories +target_include_directories(rocsparse + PRIVATE $ + PUBLIC $ + $ + $ +) diff --git a/library/include/rocsparse_version.h.in b/library/include/rocsparse_version.h.in new file mode 100644 index 00000000..e69de29b diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt new file mode 100644 index 00000000..d4d986ad --- /dev/null +++ b/library/src/CMakeLists.txt @@ -0,0 +1,4 @@ +# rocSPARSE source +set(rocsparse_source + src/context.cpp +) diff --git a/library/src/context.cpp b/library/src/context.cpp new file mode 100644 index 00000000..e69de29b diff --git a/library/src/include/context.h b/library/src/include/context.h new file mode 100644 index 00000000..e69de29b diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 00000000..0be14b95 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,33 @@ +# Function to add rocsparse tests +function(add_rocsparse_test TEST_NAME TEST_SOURCE) + get_filename_component(TEST_TARGET ${TEST_SOURCE} NAME_WE) + add_executable(${TEST_TARGET} ${TEST_SOURCE}) + target_include_directories(${TEST_TARGET} SYSTEM + PUBLIC + ${GTEST_INCLUDE_DIRS} + ) + if(HIP_PLATFORM STREQUAL "hcc") + target_link_libraries(${TEST_TARGET} + PRIVATE + rocsparse + ${GTEST_BOTH_LIBRARIES} + hip::hip_hcc + hip::hip_device + ) + foreach(amdgpu_target ${AMDGPU_TARGETS}) + target_link_libraries(${TEST_TARGET} + PRIVATE + --amdgpu-target=${amdgpu_target} + ) + endforeach() + endif() + set_target_properties(${TEST_TARGET} + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/test" + ) + add_test(${TEST_NAME} ${TEST_TARGET}) +endfunction() + +# Tests +add_rocsparse_test("device_apis" test_device_apis.cpp) +add_rocsparse_test("rocsparse.handle" test_rocsparse_handle.cpp) diff --git a/test/test_device_apis.cpp b/test/test_device_apis.cpp new file mode 100644 index 00000000..e69de29b diff --git 
a/test/test_rocsparse_handle.cpp b/test/test_rocsparse_handle.cpp new file mode 100644 index 00000000..e69de29b From fb7129368e753f8ab82c8b4002555aa41af9bc7f Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 17 Apr 2018 07:55:10 +0200 Subject: [PATCH 002/304] device api test --- test/test_device_apis.cpp | 93 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/test/test_device_apis.cpp b/test/test_device_apis.cpp index e69de29b..6637f2c1 100644 --- a/test/test_device_apis.cpp +++ b/test/test_device_apis.cpp @@ -0,0 +1,93 @@ +#include +#include +#include + +#define HIP_CHECK(x) ASSERT_EQ(x, hipSuccess) + +template +__global__ +void axpy_kernel(const T *x, T *y, T a, size_t size) +{ + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + + if(i < size) + { + y[i] += a * x[i]; + } +} + +TEST(Tests, Saxpy) +{ + size_t N = 100; + + float a = 100.0f; + std::vector x(N, 2.0f); + std::vector y(N, 1.0f); + + float *d_x; + float *d_y; + HIP_CHECK(hipMalloc(&d_x, N*sizeof(float))); + HIP_CHECK(hipMalloc(&d_y, N*sizeof(float))); + HIP_CHECK(hipMemcpy(d_x, x.data(), + N*sizeof(float), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_y, y.data(), + N*sizeof(float), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(axpy_kernel), + dim3((N+255)/256), dim3(256), 0, 0, + d_x, d_y, a, N); + HIP_CHECK(hipPeekAtLastError()); + + HIP_CHECK(hipMemcpy(y.data(), d_y, + N*sizeof(float), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipFree(d_x)); + HIP_CHECK(hipFree(d_y)); + + for(size_t i=0; i x(N, 2.0f); + std::vector y(N, 1.0f); + + double *d_x; + double *d_y; + HIP_CHECK(hipMalloc(&d_x, N*sizeof(double))); + HIP_CHECK(hipMalloc(&d_y, N*sizeof(double))); + HIP_CHECK(hipMemcpy(d_x, x.data(), + N*sizeof(double), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_y, y.data(), + N*sizeof(double), + hipMemcpyHostToDevice)); + HIP_CHECK(hipDeviceSynchronize()); + + 
hipLaunchKernelGGL(HIP_KERNEL_NAME(axpy_kernel), + dim3((N+255)/256), dim3(256), 0, 0, + d_x, d_y, a, N); + HIP_CHECK(hipPeekAtLastError()); + + HIP_CHECK(hipMemcpy(y.data(), d_y, + N*sizeof(double), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipFree(d_x)); + HIP_CHECK(hipFree(d_y)); + + for(size_t i=0; i Date: Tue, 17 Apr 2018 08:45:54 +0200 Subject: [PATCH 003/304] added cpack, make install, rocsparse_version.h.in and rocsparse_export.h --- CMakeLists.txt | 8 ++++ LICENSE.md | 7 ++++ library/CMakeLists.txt | 53 ++++++++++++++++++++++++++ library/include/rocsparse_version.h.in | 9 +++++ 4 files changed, 77 insertions(+) create mode 100644 LICENSE.md diff --git a/CMakeLists.txt b/CMakeLists.txt index 37058b6d..116d385c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,13 @@ cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +# Consider removing this in the future +# This should appear before the project command, because it does not use FORCE +if( WIN32 ) + set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) +else( ) + set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) +endif( ) + # CMake modules list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 00000000..e0f8156e --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,7 @@ +Copyright © 2016 Advanced Micro Devices, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index eb968c18..9d2bb556 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -1,3 +1,6 @@ +# Package that helps me set visibility for function names exported from shared library +include(GenerateExportHeader) + # Verbose build info if(BUILD_VERBOSE) message(STATUS "rocsparse_VERSION: ${rocsparse_VERSION}") @@ -23,6 +26,9 @@ if(BUILD_VERBOSE) message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_RELEASE: ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}") endif() +# .so version +set(rocsparse_SOVERSION 0) + # Configure a header file to pass the rocSPARSE version configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/include/rocsparse_version.h.in" @@ -58,3 +64,50 @@ target_include_directories(rocsparse $ $ ) + +# Target properties +set_target_properties(rocsparse PROPERTIES VERSION ${rocsparse_VERSION} SOVERSION ${rocsparse_SOVERSION} CXX_EXTENSIONS NO) +set_target_properties(rocsparse PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging") +set_target_properties(rocsparse PROPERTIES DEBUG_POSTFIX "-d") +set_target_properties(rocsparse PROPERTIES CXX_VISIBILITY_PRESET "hidden" VISIBILITY_INLINES_HIDDEN ON) +generate_export_header(rocsparse EXPORT_FILE_NAME ${PROJECT_BINARY_DIR}/include/rocsparse_export.h) + +# Installation +rocm_install_targets( + TARGETS rocsparse + INCLUDE + ${CMAKE_SOURCE_DIR}/library/include + ${CMAKE_BINARY_DIR}/include + PREFIX rocsparse +) + +rocm_export_targets( + TARGETS rocsparse-targets + PREFIX rocsparse + DEPENDS PACKAGE hip + NAMESPACE roc:: +) + +# Package specific CPACK vars +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc (>= 1.3)") +set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc >= 1.3") +set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/../LICENSE.md") + +if(NOT CPACK_PACKAGING_INSTALL_PREFIX) + set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") +endif() + +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" 
"\${CPACK_PACKAGING_INSTALL_PREFIX}/include") + +# Package name +set(package_name rocsparse) + +set(ROCSPARSE_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" CACHE PATH "Path placed into ldconfig file") + +rocm_create_package( + NAME ${package_name} + DESCRIPTION "Radeon Open Compute SPARSE library" + MAINTAINER "Nico Trost " + LDCONFIG + LDCONFIG_DIR ${ROCSPARSE_CONFIG_DIR} +) diff --git a/library/include/rocsparse_version.h.in b/library/include/rocsparse_version.h.in index e69de29b..ffeec17a 100644 --- a/library/include/rocsparse_version.h.in +++ b/library/include/rocsparse_version.h.in @@ -0,0 +1,9 @@ +#ifndef ROCSPARSE_VERSION_H_ +#define ROCSPARSE_VERSION_H_ + +#define ROCSPARSE_VERSION_MAJOR @rocsparse_VERSION_MAJOR@ +#define ROCSPARSE_VERSION_MINOR @rocsparse_VERSION_MINOR@ +#define ROCSPARSE_VERSION_PATCH @rocsparse_VERSION_PATCH@ +#define ROCSPARSE_VERSION_TWEAK @rocsparse_VERSION_TWEAK@ + +#endif // ROCSPARSE_VERSION_H_ From 9ef7b6b0310526b1704db517ed41d935602e3248 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 17 Apr 2018 08:46:44 +0200 Subject: [PATCH 004/304] rocsparseContext and rocsparse header --- LICENSE.txt | 7 -- library/include/rocsparse.h | 190 ++++++++++++++++++++++++++++++++++ library/src/context.cpp | 1 + library/src/include/context.h | 35 +++++++ 4 files changed, 226 insertions(+), 7 deletions(-) delete mode 100644 LICENSE.txt create mode 100644 library/include/rocsparse.h diff --git a/LICENSE.txt b/LICENSE.txt deleted file mode 100644 index e0f8156e..00000000 --- a/LICENSE.txt +++ /dev/null @@ -1,7 +0,0 @@ -Copyright © 2016 Advanced Micro Devices, Inc. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/library/include/rocsparse.h b/library/include/rocsparse.h new file mode 100644 index 00000000..fad144b1 --- /dev/null +++ b/library/include/rocsparse.h @@ -0,0 +1,190 @@ +#ifndef ROCSPARSE_H_ +#define ROCSPARSE_H_ + +/* !\file + * \brief rocsparse.h exposes a common interface that provides Basic Linear + * Algebra Subroutines for sparse computation using HIP optimized AMD HCC- + * based GPU hardware. This library can also run on CUDA-based NVIDIA GPUs. + */ + +#include "rocsparse_version.h" +#include "rocsparse_export.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief rocsparse status codes definition. 
*/ +typedef enum { + ROCSPARSE_STATUS_SUCCESS = 0, + ROCSPARSE_STATUS_NOT_INITIALIZED = 1, + ROCSPARSE_STATUS_ALLOC_FAILED = 2, + ROCSPARSE_STATUS_INVALID_VALUE = 3, + ROCSPARSE_STATUS_ARCH_MISMATCH = 4, + ROCSPARSE_STATUS_MAPPING_ERROR = 5, + ROCSPARSE_STATUS_EXECUTION_FAILED = 6, + ROCSPARSE_STATUS_INTERNAL_ERROR = 7, + ROCSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8, + ROCSPARSE_STATUS_ZERO_PIVOT = 9, + ROCSPARSE_STATUS_INVALID_POINTER = 10, + ROCSPARSE_STATUS_INVALID_SIZE = 11, + ROCSPARSE_STATUS_MEMORY_ERROR = 12, + ROCSPARSE_STATUS_INVALID_HANDLE = 13 +} rocsparseStatus_t; + +struct rocsparseContext; +typedef struct rocsparseContext *rocsparseHandle_t; + +struct rocsparseMatDescr; +typedef struct rocsparseMatDescr *rocsparseMatDescr_t; + +/*! \brief Used to specify whether the matrix is to be transposed or not. */ +typedef enum { + ROCSPARSE_OPERATION_NON_TRANSPOSE = 0, + ROCSPARSE_OPERATION_TRANSPOSE = 1, + ROCSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 +} rocsparseOperation_t; + +/*! \brief Indicates whether the pointer is a device or host pointer. */ +typedef enum { + ROCSPARSE_POINTER_MODE_HOST = 0, + ROCSPARSE_POINTER_MODE_DEVICE = 1 +} rocsparsePointerMode_t; + +/*! \brief Used to specify the matrix index base. */ +typedef enum { + ROCSPARSE_INDEX_BASE_ZERO = 0, + ROCSPARSE_INDEX_BASE_ONE = 1 +} rocsparseIndexBase_t; + +/*! \brief Indicates if layer is active with bitmask. */ +typedef enum { + ROCSPARSE_LAYER_MODE_NONE = 0b0000000000, + ROCSPARSE_LAYER_MODE_LOG_TRACE = 0b0000000001, + ROCSPARSE_LAYER_MODE_LOG_BENCH = 0b0000000010 +} rocsparseLayerMode_t; + + + +/******************************************************************************** + * \brief rocsparseHandle_t is a structure holding the rocsparse library context. + * It must be initialized using rocsparseCreate() + * and the returned handle must be passed + * to all subsequent library function calls. + * It should be destroyed at the end using rocsparseDestroy().
+ *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle); + +/******************************************************************************** + * \brief destroy handle + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseDestroy(rocsparseHandle_t handle); + +/******************************************************************************** + * \brief rocsparseMatDescr_t is a structure holding the rocsparse matrix + * descriptor. It must be initialized using rocsparseCreateMatDescr() + * and the returned handle must be passed to all subsequent library function + * calls that involve the matrix. + * It should be destroyed at the end using rocsparseDestroyMatDescr(). + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseCreateMatDescr(rocsparseMatDescr_t *descrA); + +/******************************************************************************** + * \brief destroy matrix descriptor + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseDestroyMatDescr(rocsparseMatDescr_t descrA); + +/* + * =========================================================================== + * level 1 SPARSE + * =========================================================================== + */ + + + +/* + * =========================================================================== + * level 2 SPARSE + * =========================================================================== + */ + +/*!
\brief SPARSE Level 2 API + + \details + csrmv multiplies the dense vector x[i] with scalar alpha and sparse m x n + matrix A that is defined in CSR storage format and add the result to y[i] + that is multiplied by beta, for i = 1 , … , n + + y := alpha * op(A) * x + beta * y, + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + transA operation type of A. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero entries of A. + @param[in] + alpha scalar alpha. + @param[in] + descrA descriptor of A. + @param[in] + csrValA array of nnz elements of A. + @param[in] + csrRowPtrA array of m+1 elements that point to the start + of every row of A. + @param[in] + csrColIndA array of nnz elements containing the column indices of A. + @param[in] + x array of n elements (op(A) = A) or m elements (op(A) = A^T or + op(A) = A^H). + @param[in] + beta scalar beta. + @param[inout] + y array of m elements (op(A) = A) or n elements (op(A) = A^T or + op(A) = A^H). 
+ + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseScsrmv(rocsparseHandle_t handle, + rocsparseOperation_t transA, + int m, + int n, + int nnz, + const float *alpha, + const rocsparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const float *x, + const float *beta, + float *y); + +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseDcsrmv(rocsparseHandle_t handle, + rocsparseOperation_t transA, + int m, + int n, + int nnz, + const double *alpha, + const rocsparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const double *x, + const double *beta, + double *y); + +#ifdef __cplusplus +} +#endif + +#endif // ROCSPARSE_H_ diff --git a/library/src/context.cpp b/library/src/context.cpp index e69de29b..7e33fb62 100644 --- a/library/src/context.cpp +++ b/library/src/context.cpp @@ -0,0 +1 @@ +#include "context.h" diff --git a/library/src/include/context.h b/library/src/include/context.h index e69de29b..87642dba 100644 --- a/library/src/include/context.h +++ b/library/src/include/context.h @@ -0,0 +1,35 @@ +#ifndef ROCSPARSE_CONTEXT_H_ +#define ROCSPARSE_CONTEXT_H_ + +#include "rocsparse.h" + +#include +#include +#include + +/******************************************************************************* + * \brief rocsparseContext is a structure holding the rocsparse library context. 
+******************************************************************************/ +struct rocsparseContext +{ + rocsparseContext(); + ~rocsparseContext(); + + // device id + int device; + // device properties + hipDeviceProp_t properties; + // stream + hipStream_t stream; + // pointer mode + rocsparsePointerMode_t pointer_mode; + // logging mode + rocsparseLayerMode_t layer_mode; + + std::ofstream log_trace_ofs; + std::ofstream log_bench_ofs; + std::ostream *log_trace_os; + std::ostream *log_bench_os; +}; + +#endif // ROCSPARSE_CONTEXT_H_ From e66a463880c45aebc2763c9169c729bf197ef9ec Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 17 Apr 2018 08:56:04 +0200 Subject: [PATCH 005/304] license header --- CMakeLists.txt | 4 ++++ benchmark/CMakeLists.txt | 4 ++++ example/CMakeLists.txt | 4 ++++ example/rocsparse_handle.cpp | 4 ++++ library/CMakeLists.txt | 4 ++++ library/include/rocsparse.h | 4 ++++ library/include/rocsparse_version.h.in | 4 ++++ library/src/CMakeLists.txt | 4 ++++ library/src/context.cpp | 4 ++++ library/src/include/context.h | 4 ++++ test/CMakeLists.txt | 4 ++++ test/test_device_apis.cpp | 4 ++++ test/test_rocsparse_handle.cpp | 5 +++++ 13 files changed, 53 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 116d385c..9f8465fb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,3 +1,7 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. +# ######################################################################## + cmake_minimum_required(VERSION 3.5 FATAL_ERROR) # Consider removing this in the future diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index f37f7465..2cf0dc8b 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,3 +1,7 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. 
+# ######################################################################## + # Function to add rocsparse benchmarks function(add_rocsparse_benchmark BENCHMARK_SOURCE) get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 25fd2293..2c791398 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -1,3 +1,7 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. +# ######################################################################## + # Function to add rocsparse examples function(add_rocsparse_example EXAMPLE_SOURCE) get_filename_component(EXAMPLE_TARGET ${EXAMPLE_SOURCE} NAME_WE) diff --git a/example/rocsparse_handle.cpp b/example/rocsparse_handle.cpp index bd9ec9d1..ce71752b 100644 --- a/example/rocsparse_handle.cpp +++ b/example/rocsparse_handle.cpp @@ -1,3 +1,7 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * ************************************************************************ */ + int main(int argc, char *argv[]) { return 0; diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index 9d2bb556..8ba24a3d 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -1,3 +1,7 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. +# ######################################################################## + # Package that helps me set visibility for function names exported from shared library include(GenerateExportHeader) diff --git a/library/include/rocsparse.h b/library/include/rocsparse.h index fad144b1..c6e392ba 100644 --- a/library/include/rocsparse.h +++ b/library/include/rocsparse.h @@ -1,3 +1,7 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + #ifndef ROCSPARSE_H_ #define ROCSPARSE_H_ diff --git a/library/include/rocsparse_version.h.in b/library/include/rocsparse_version.h.in index ffeec17a..026b6d4c 100644 --- a/library/include/rocsparse_version.h.in +++ b/library/include/rocsparse_version.h.in @@ -1,3 +1,7 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * ************************************************************************ */ + #ifndef ROCSPARSE_VERSION_H_ #define ROCSPARSE_VERSION_H_ diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index d4d986ad..c6b0c86f 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -1,3 +1,7 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. +# ######################################################################## + # rocSPARSE source set(rocsparse_source src/context.cpp diff --git a/library/src/context.cpp b/library/src/context.cpp index 7e33fb62..2294d21b 100644 --- a/library/src/context.cpp +++ b/library/src/context.cpp @@ -1 +1,5 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * ************************************************************************ */ + #include "context.h" diff --git a/library/src/include/context.h b/library/src/include/context.h index 87642dba..5eb5b877 100644 --- a/library/src/include/context.h +++ b/library/src/include/context.h @@ -1,3 +1,7 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + #ifndef ROCSPARSE_CONTEXT_H_ #define ROCSPARSE_CONTEXT_H_ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0be14b95..b4a5e966 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,3 +1,7 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. +# ######################################################################## + # Function to add rocsparse tests function(add_rocsparse_test TEST_NAME TEST_SOURCE) get_filename_component(TEST_TARGET ${TEST_SOURCE} NAME_WE) diff --git a/test/test_device_apis.cpp b/test/test_device_apis.cpp index 6637f2c1..3bcb8b23 100644 --- a/test/test_device_apis.cpp +++ b/test/test_device_apis.cpp @@ -1,3 +1,7 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * ************************************************************************ */ + #include #include #include diff --git a/test/test_rocsparse_handle.cpp b/test/test_rocsparse_handle.cpp index e69de29b..84691891 100644 --- a/test/test_rocsparse_handle.cpp +++ b/test/test_rocsparse_handle.cpp @@ -0,0 +1,5 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + + From 49f11f5bd7b9a7184d13ec44f87f41549a3227e7 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 17 Apr 2018 09:14:30 +0200 Subject: [PATCH 006/304] README.md update --- README.md | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 633d7515..d7681912 100644 --- a/README.md +++ b/README.md @@ -1 +1,78 @@ -# rocSparse \ No newline at end of file +# rocSPARSE +rocSPARSE exposes a common interface that provides Basic Linear Algebra Subroutines for sparse computation implemented on top of AMD's Radeon Open Compute [ROCm][] runtime and toolchains. rocSPARSE is created using the [HIP][] programming language and optimized for AMD's latest discrete GPUs. + +## Requirements +* Git +* CMake (3.5 or later) +* AMD [ROCm] platform + +Optional: +* [GTest][] + * Required for tests. + * Use GTEST_ROOT to specify GTest location. + * If [GTest][] is not found, it will be downloaded and built automatically. +* [Google Benchmark][] + * Required for benchmarks. + * If [Google Benchmark][] is not found, it will be downloaded and built automatically. + +## Quickstart rocSPARSE build and install + +#### CMake +All compiler specifications are determined automatically. The compilation process can be performed by +``` +# Clone rocSPARSE using git +git clone https://github.com/ROCmSoftwarePlatform/rocSparse.git + +# Go to rocSPARSE directory, create and go to the build directory +cd rocSparse; mkdir build; cd build + +# Configure rocSPARSE +# Build options: +# BUILD_TEST - build tests using [GTest][] (OFF) +# BUILD_BENCHMARK - build benchmarks using [Google Benchmark][] (OFF) +# BUILD_EXAMPLE - build examples (ON) +# BUILD_VERBOSE - verbose output (OFF) +# BUILD_SHARED_LIBS - build rocSPARSE as a shared library (ON) +cmake -DBUILD_TEST=ON ..
+ +# Build +make + +# Install +[sudo] make install +``` + +## Unit tests +To run unit tests, rocSPARSE has to be built with option -DBUILD_TEST=ON. +``` +# Go to rocSPARSE build directory +cd rocSparse; cd build + +# Run all tests +ctest +``` + +## Benchmarks +To run benchmarks, rocSPARSE has to be built with option -DBUILD_BENCHMARK=ON. +``` +# Go to rocSPARSE build directory +cd rocSparse/build + +# Run benchmark +./benchmark/benchmark_spmv +``` + +## Support +Please use [the issue tracker][] for bugs and feature requests. + +## License +The [license file][] can be found in the main repository. + + + +[ROCm]: https://github.com/RadeonOpenCompute/ROCm +[HIP]: https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/ +[GTest]: https://github.com/google/googletest +[Google Benchmark]: https://github.com/google/benchmark +[the issue tracker]: https://github.com/ROCmSoftwarePlatform/rocSparse/issues +[license file]: ./LICENSE.md From 8b4056d4c6f63b4fc861992db8279bc6e64bc4a8 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 17 Apr 2018 10:13:17 +0200 Subject: [PATCH 007/304] csrmv infrastructure, benchmark and test --- CMakeLists.txt | 6 +- benchmark/CMakeLists.txt | 2 +- benchmark/benchmark_spmv.cpp | 232 +++++++++++++++++++++++++ benchmark/benchmark_utils.h | 69 ++++++++ example/rocsparse_handle.cpp | 11 +- library/src/CMakeLists.txt | 6 +- library/src/context.cpp | 21 ++- library/src/include/matrix.h | 12 ++ library/src/level2/rocsparse_csrmv.cpp | 67 +++++++ library/src/matrix.cpp | 23 +++ test/CMakeLists.txt | 1 + test/test_rocsparse_csrmv.cpp | 161 +++++++++++++++++ test/test_rocsparse_handle.cpp | 12 +- test/test_utils.h | 69 ++++++++ 14 files changed, 683 insertions(+), 9 deletions(-) create mode 100644 benchmark/benchmark_spmv.cpp create mode 100644 benchmark/benchmark_utils.h create mode 100644 library/src/include/matrix.h create mode 100644 library/src/level2/rocsparse_csrmv.cpp create mode 100644 library/src/matrix.cpp create mode 100644
test/test_rocsparse_csrmv.cpp create mode 100644 test/test_utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f8465fb..504e8a75 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,9 +75,9 @@ endif() # Build options option(BUILD_SHARED_LIBS "Build rocSPARSE as a shared library" ON) -option(BUILD_VERBOSE "Output additional build information" ON) -option(BUILD_TEST "Build tests (requires googletest)" ON) -option(BUILD_BENCHMARK "Build benchmarks (requires googlebenchmark)" ON) +option(BUILD_VERBOSE "Output additional build information" OFF) +option(BUILD_TEST "Build tests (requires googletest)" OFF) +option(BUILD_BENCHMARK "Build benchmarks (requires googlebenchmark)" OFF) option(BUILD_EXAMPLE "Build examples" ON) # Test dependencies diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 2cf0dc8b..4e053f26 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -27,4 +27,4 @@ function(add_rocsparse_benchmark BENCHMARK_SOURCE) endfunction() # Benchmarks -#add_rocsparse_benchmark(benchmark_spmv.cpp) +add_rocsparse_benchmark(benchmark_spmv.cpp) diff --git a/benchmark/benchmark_spmv.cpp b/benchmark/benchmark_spmv.cpp new file mode 100644 index 00000000..edd0fc79 --- /dev/null +++ b/benchmark/benchmark_spmv.cpp @@ -0,0 +1,232 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "benchmark/benchmark.h" +#include "benchmark_utils.h" + +#include +#include + +#define HIP_CHECK(stat) \ +{ \ + hipError_t err = stat; \ + if (err != hipSuccess) \ + { \ + fprintf(stderr, "HIP error: %d line: %d\n", err, __LINE__); \ + exit(stat); \ + } \ +} + +#define ROCSPARSE_CHECK(stat) \ +{ \ + rocsparseStatus_t err = stat; \ + if (err != ROCSPARSE_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "ROCSPARSE error: %d line: %d\n", err, __LINE__); \ + exit(stat); \ + } \ +} + +void csrmv(rocsparseHandle_t handle, rocsparseOperation_t trans, + int nrow, int ncol, int nnz, const float *alpha, + rocsparseMatDescr_t descrA, const float *csrValA, + const int *csrRowPtrA, const int *csrColIndA, + const float *x, const float *beta, float *y) +{ + ROCSPARSE_CHECK(rocsparseScsrmv(handle, trans, nrow, ncol, nnz, alpha, + descrA, csrValA, csrRowPtrA, csrColIndA, + x, beta, y)); +} + +void csrmv(rocsparseHandle_t handle, rocsparseOperation_t trans, + int nrow, int ncol, int nnz, const double *alpha, + rocsparseMatDescr_t descrA, const double *csrValA, + const int *csrRowPtrA, const int *csrColIndA, + const double *x, const double *beta, double *y) +{ + ROCSPARSE_CHECK(rocsparseDcsrmv(handle, trans, nrow, ncol, nnz, alpha, + descrA, csrValA, csrRowPtrA, csrColIndA, + x, beta, y)); +} + +template +void run_benchmark(benchmark::State &state, const hipStream_t stream, int batch, + rocsparseHandle_t handle, rocsparseOperation_t trans, + int nrow, int ncol, int nnz, rocsparseMatDescr_t descr, + const ValueType *alpha, const ValueType *csrValA, + const int *csrRowPtrA, const int *csrColIndA, + const ValueType *x, const ValueType *beta, ValueType *y) +{ + // Warm up + for (int i=0; i<10; ++i) + { + csrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, + nrow, ncol, nnz, alpha, descr, csrValA, + csrRowPtrA, csrColIndA, x, beta, y); + } + HIP_CHECK(hipDeviceSynchronize()); + + for (auto _:state) + { + 
auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i=0; i >(end-start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations()*batch* + (sizeof(ValueType)*(2*nrow+nnz)+sizeof(int)*(nrow+1+nnz))); + state.SetItemsProcessed(state.iterations()*batch*2*nnz); +} + +int main(int argc, char *argv[]) +{ + int ndim = 2000; + int trials = 200; + int batch_size = 1; + + // Parse command line + if (argc > 2) + { + ndim = atoi(argv[1]); + } + if (argc > 3) + { + trials = atoi(argv[2]); + } + if (argc > 4) + { + batch_size = atoi(argv[3]); + } + + // rocSPARSE handle + rocsparseHandle_t handle; + ROCSPARSE_CHECK(rocsparseCreate(&handle)); + + benchmark::Initialize(&argc, argv); + + hipStream_t stream = 0; + hipDeviceProp_t devProp; + int device_id = 0; + + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + printf("[HIP] Device name: %s\n", devProp.name); + + // Generate problem + int *Aptr = NULL; + int *Acol = NULL; + float *Avalf = NULL; + double *Avald = NULL; + int nrow = gen2DLaplacianUS(ndim, &Aptr, &Acol, &Avald); + int nnz = Aptr[nrow]; + + Avalf = (float*) malloc(sizeof(float)*nnz); + for (int i=0; i benchmarks = + { + benchmark::RegisterBenchmark("rocsparseScsrmv", run_benchmark, + stream, batch_size, + handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, + nrow, nrow, nnz, descrA, &alphaf, dAvalf, + dAptr, dAcol, dxf, &betaf, dyf), + benchmark::RegisterBenchmark("rocsparseDcsrmv", run_benchmark, + stream, batch_size, + handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, + nrow, nrow, nnz, descrA, &alphad, dAvald, + dAptr, dAcol, dxd, &betad, dyd) + }; + + for (auto& b:benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + b->Iterations(trials); + } + + benchmark::RunSpecifiedBenchmarks(); + + // Clear up on device + HIP_CHECK(hipFree(dAptr)); + HIP_CHECK(hipFree(dAcol)); + HIP_CHECK(hipFree(dAvalf)); + HIP_CHECK(hipFree(dAvald)); + HIP_CHECK(hipFree(dxf)); 
+ HIP_CHECK(hipFree(dxd)); + HIP_CHECK(hipFree(dyf)); + HIP_CHECK(hipFree(dyd)); + + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); + ROCSPARSE_CHECK(rocsparseDestroy(handle)); + + return 0; +} diff --git a/benchmark/benchmark_utils.h b/benchmark/benchmark_utils.h new file mode 100644 index 00000000..76d2a04e --- /dev/null +++ b/benchmark/benchmark_utils.h @@ -0,0 +1,69 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#ifndef ROCSPARSE_BENCHMARK_UTILS_H_ +#define ROCSPARSE_BENCHMARK_UTILS_H_ + +#include + +template +inline int gen2DLaplacianUS(int ndim, int **rowptr, int **col, T **val) +{ + + int n = ndim * ndim; + int nnz_mat = n * 5 - ndim * 4; + + *rowptr = (int*) malloc((n+1)*sizeof(int)); + *col = (int*) malloc(nnz_mat*sizeof(int)); + *val = (T*) malloc(nnz_mat*sizeof(T)); + + int nnz = 0; + + // Fill local arrays + for (int i=0; i + +int main(int argc, char *argv[]) +{ + rocsparseHandle_t handle; + rocsparseCreate(&handle); + + rocsparseDestroy(handle); return 0; } diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index c6b0c86f..217efc97 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -1,8 +1,12 @@ # ######################################################################## -# Copyright 2016 Advanced Micro Devices, Inc. +# Copyright 2018 Advanced Micro Devices, Inc. # ######################################################################## # rocSPARSE source set(rocsparse_source src/context.cpp + src/matrix.cpp + + + src/level2/rocsparse_csrmv.cpp ) diff --git a/library/src/context.cpp b/library/src/context.cpp index 2294d21b..7271659e 100644 --- a/library/src/context.cpp +++ b/library/src/context.cpp @@ -1,5 +1,24 @@ /* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. 
+ * Copyright 2018 Advanced Micro Devices, Inc. * ************************************************************************ */ #include "context.h" +#include "rocsparse.h" + +extern "C" rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle) +{ + // Check if handle is valid + if (handle == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + else + { + return ROCSPARSE_STATUS_SUCCESS; + } +} + +extern "C" rocsparseStatus_t rocsparseDestroy(rocsparseHandle_t handle) +{ + return ROCSPARSE_STATUS_SUCCESS; +} diff --git a/library/src/include/matrix.h b/library/src/include/matrix.h new file mode 100644 index 00000000..862eee4d --- /dev/null +++ b/library/src/include/matrix.h @@ -0,0 +1,12 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#ifndef ROCSPARSE_MATRIX_H_ +#define ROCSPARSE_MATRIX_H_ + +typedef struct rocsparseMatDescr { + int todo; +} rocsparseMatDescr; + +#endif // ROCSPARSE_MATRIX_H_ diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp new file mode 100644 index 00000000..cb72f837 --- /dev/null +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -0,0 +1,67 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "rocsparse.h" +#include "context.h" +#include "matrix.h" + +template +rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, + rocsparseOperation_t transA, + int m, + int n, + int nnz, + const T *alpha, + const rocsparseMatDescr_t descrA, + const T *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const T *x, + const T *beta, + T *y) +{ + return ROCSPARSE_STATUS_SUCCESS; +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparseStatus_t rocsparseScsrmv(rocsparseHandle_t handle, + rocsparseOperation_t transA, + int m, + int n, + int nnz, + const float *alpha, + const rocsparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const float *x, + const float *beta, + float *y) +{ + return rocsparseTcsrmv(handle, transA, m, n, nnz, alpha, descrA, + csrValA, csrRowPtrA, csrColIndA, x, beta, y); +} + +extern "C" rocsparseStatus_t rocsparseDcsrmv(rocsparseHandle_t handle, + rocsparseOperation_t transA, + int m, + int n, + int nnz, + const double *alpha, + const rocsparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const double *x, + const double *beta, + double *y) +{ + return rocsparseTcsrmv(handle, transA, m, n, nnz, alpha, descrA, + csrValA, csrRowPtrA, csrColIndA, x, beta, y); +} diff --git a/library/src/matrix.cpp b/library/src/matrix.cpp new file mode 100644 index 00000000..779e6f06 --- /dev/null +++ b/library/src/matrix.cpp @@ -0,0 +1,23 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "matrix.h" +#include "rocsparse.h" + +extern "C" rocsparseStatus_t rocsparseCreateMatDescr(rocsparseMatDescr_t *descrA) +{ + if (descrA == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + else + { + return ROCSPARSE_STATUS_SUCCESS; + } +} + +extern "C" rocsparseStatus_t rocsparseDestroyMatDescr(rocsparseMatDescr_t descrA) +{ + return ROCSPARSE_STATUS_SUCCESS; +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b4a5e966..c23ac3e4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -35,3 +35,4 @@ endfunction() # Tests add_rocsparse_test("device_apis" test_device_apis.cpp) add_rocsparse_test("rocsparse.handle" test_rocsparse_handle.cpp) +add_rocsparse_test("rocsparse.csrmv" test_rocsparse_csrmv.cpp) diff --git a/test/test_rocsparse_csrmv.cpp b/test/test_rocsparse_csrmv.cpp new file mode 100644 index 00000000..ec5c8a32 --- /dev/null +++ b/test/test_rocsparse_csrmv.cpp @@ -0,0 +1,161 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "test_utils.h" + +#include +#include +#include +#include + +#define HIP_CHECK(x) ASSERT_EQ(x, hipSuccess) +#define ROCSPARSE_CHECK(x) ASSERT_EQ(x, ROCSPARSE_STATUS_SUCCESS) + +TEST(Tests, rocsparseScsrmv) +{ + rocsparseHandle_t handle; + ROCSPARSE_CHECK(rocsparseCreate(&handle)); + + // Generate problem + int *Aptr = NULL; + int *Acol = NULL; + float *Aval = NULL; + int nrow = gen2DLaplacianUS(2000, &Aptr, &Acol, &Aval); + int nnz = Aptr[nrow]; + + // Sample some random data + srand(12345ULL); + + float alpha = (float) rand() / RAND_MAX; + float beta = (float) rand() / RAND_MAX; + + float *x = (float*) malloc(sizeof(float)*nrow); + float *y = (float*) malloc(sizeof(float)*nrow); + for (int i=0; i +#include +#define ROCSPARSE_CHECK(x) ASSERT_EQ(x, ROCSPARSE_STATUS_SUCCESS) + +TEST(Tests, handle) +{ + rocsparseHandle_t handle; + ROCSPARSE_CHECK(rocsparseCreate(&handle)); + ROCSPARSE_CHECK(rocsparseDestroy(handle)); +} diff --git a/test/test_utils.h b/test/test_utils.h new file mode 100644 index 00000000..22824599 --- /dev/null +++ b/test/test_utils.h @@ -0,0 +1,69 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#ifndef ROCSPARSE_TEST_UTILS_H_ +#define ROCSPARSE_TEST_UTILS_H_ + +#include + +template +inline int gen2DLaplacianUS(int ndim, int **rowptr, int **col, T **val) +{ + + int n = ndim * ndim; + int nnz_mat = n * 5 - ndim * 4; + + *rowptr = (int*) malloc((n+1)*sizeof(int)); + *col = (int*) malloc(nnz_mat*sizeof(int)); + *val = (T*) malloc(nnz_mat*sizeof(T)); + + int nnz = 0; + + // Fill local arrays + for (int i=0; i Date: Tue, 17 Apr 2018 11:47:27 +0200 Subject: [PATCH 008/304] context and mat descriptor filled --- library/include/rocsparse.h | 8 ++++++++ library/src/context.cpp | 35 +++++++++++++++++++++++++++++++++++ library/src/include/matrix.h | 17 ++++++++++++++--- library/src/matrix.cpp | 34 +++++++++++++++++++++++++++++++++- 4 files changed, 90 insertions(+), 4 deletions(-) diff --git a/library/include/rocsparse.h b/library/include/rocsparse.h index c6e392ba..5aa3be07 100644 --- a/library/include/rocsparse.h +++ b/library/include/rocsparse.h @@ -61,6 +61,13 @@ typedef enum { ROCSPARSE_INDEX_BASE_ONE = 1 } rocsparseIndexBase_t; +/*! \brief Used to specify the matrix type. */ +typedef enum { + ROCSPARSE_MATRIX_TYPE_GENERAL = 0, + ROCSPARSE_MATRIX_TYPE_SYMMETRIC = 1, + ROCSPARSE_MATRIX_TYPE_HERMITIAN = 2 +} rocsparseMatrixType_t; + /*! \brief Indicates if layer is active with bitmask. */ typedef enum { ROCSPARSE_LAYER_MODE_NONE = 0b0000000000, @@ -70,6 +77,7 @@ typedef enum { + /******************************************************************************** * \brief rocsparseHandle_t is a structure holding the rocsparse library context. 
* It must be initialized using rocsparseCreate() diff --git a/library/src/context.cpp b/library/src/context.cpp index 7271659e..ce053969 100644 --- a/library/src/context.cpp +++ b/library/src/context.cpp @@ -5,6 +5,23 @@ #include "context.h" #include "rocsparse.h" + + +rocsparseContext::rocsparseContext() +{ + // Default is system stream + stream = 0; + // Default pointer mode is host + pointer_mode = ROCSPARSE_POINTER_MODE_HOST; + +} + +rocsparseContext::~rocsparseContext() +{ +} + + + extern "C" rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle) { // Check if handle is valid @@ -14,11 +31,29 @@ extern "C" rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle) } else { + // Allocate + try + { + *handle = new rocsparseContext; + } + catch(rocsparseStatus_t status) + { + return status; + } return ROCSPARSE_STATUS_SUCCESS; } } extern "C" rocsparseStatus_t rocsparseDestroy(rocsparseHandle_t handle) { + // Destruct + try + { + delete handle; + } + catch(rocsparseStatus_t status) + { + return status; + } return ROCSPARSE_STATUS_SUCCESS; } diff --git a/library/src/include/matrix.h b/library/src/include/matrix.h index 862eee4d..392db518 100644 --- a/library/src/include/matrix.h +++ b/library/src/include/matrix.h @@ -5,8 +5,19 @@ #ifndef ROCSPARSE_MATRIX_H_ #define ROCSPARSE_MATRIX_H_ -typedef struct rocsparseMatDescr { - int todo; -} rocsparseMatDescr; +#include "rocsparse.h" + +struct rocsparseMatDescr +{ + // Constructor + rocsparseMatDescr(); + // Destructor + ~rocsparseMatDescr(); + + // Matrix index base + rocsparseIndexBase_t base; + // Matrix type + rocsparseMatrixType_t type; +}; #endif // ROCSPARSE_MATRIX_H_ diff --git a/library/src/matrix.cpp b/library/src/matrix.cpp index 779e6f06..93d90963 100644 --- a/library/src/matrix.cpp +++ b/library/src/matrix.cpp @@ -1,10 +1,24 @@ /* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. + * Copyright 2018 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include "matrix.h" #include "rocsparse.h" + + +rocsparseMatDescr::rocsparseMatDescr() +{ + base = ROCSPARSE_INDEX_BASE_ZERO; + type = ROCSPARSE_MATRIX_TYPE_GENERAL; +} + +rocsparseMatDescr::~rocsparseMatDescr() +{ +} + + + extern "C" rocsparseStatus_t rocsparseCreateMatDescr(rocsparseMatDescr_t *descrA) { if (descrA == nullptr) @@ -13,11 +27,29 @@ extern "C" rocsparseStatus_t rocsparseCreateMatDescr(rocsparseMatDescr_t *descrA } else { + // Allocate + try + { + *descrA = new rocsparseMatDescr; + } + catch(rocsparseStatus_t status) + { + return status; + } return ROCSPARSE_STATUS_SUCCESS; } } extern "C" rocsparseStatus_t rocsparseDestroyMatDescr(rocsparseMatDescr_t descrA) { + // Destruct + try + { + delete descrA; + } + catch(rocsparseStatus_t status) + { + return status; + } return ROCSPARSE_STATUS_SUCCESS; } From 882ef4263d1e3bdca7086fee02638adb82e29ca0 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 17 Apr 2018 11:48:02 +0200 Subject: [PATCH 009/304] sanity checks for csrmv --- library/src/level2/rocsparse_csrmv.cpp | 92 ++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index cb72f837..3563c936 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -21,6 +21,98 @@ rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, const T *beta, T *y) { + // Check for valid handle and matrix descriptor + if (handle == nullptr) + { + return ROCSPARSE_STATUS_NOT_INITIALIZED; + } + else if (descrA == nullptr) + { + return ROCSPARSE_STATUS_NOT_INITIALIZED; + } + + // Logging + if (handle->pointer_mode == ROCSPARSE_POINTER_MODE_HOST) + { + // TODO + } + + // Check matrix type + if (descrA->base != ROCSPARSE_INDEX_BASE_ZERO) + { + // TODO + return ROCSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED; + } + if (descrA->type != ROCSPARSE_MATRIX_TYPE_GENERAL) + 
{ + // TODO + return ROCSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED; + } + + + // Check sizes + if (m < 0) + { + return ROCSPARSE_STATUS_INVALID_VALUE; + } + else if (n < 0) + { + return ROCSPARSE_STATUS_INVALID_VALUE; + } + else if (nnz < 0) + { + return ROCSPARSE_STATUS_INVALID_VALUE; + } + + // Check pointer arguments + if (csrValA == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + else if (csrRowPtrA == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + else if (csrColIndA == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + else if (x == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + else if (y == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + else if (alpha == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + else if (beta == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + + // Quick return if possible + if (m == 0 || n == 0 || nnz == 0) + { + return ROCSPARSE_STATUS_SUCCESS; + } + + // Stream + hipStream_t stream = handle->stream; + + // Run different csrmv kernels + if (transA == ROCSPARSE_OPERATION_NON_TRANSPOSE) + { + // TODO + } + else + { + // TODO + return ROCSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED; + } return ROCSPARSE_STATUS_SUCCESS; } From 3ea5296af011d2dc17377f9d9cf62f9ec1bc290c Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 17 Apr 2018 13:57:56 +0200 Subject: [PATCH 010/304] copied over logging, utility and status functions from rocblas --- library/src/include/definitions.h | 26 ++++ library/src/include/logging.h | 210 ++++++++++++++++++++++++++++++ library/src/include/status.h | 18 +++ library/src/include/utility.h | 85 ++++++++++++ library/src/status.cpp | 46 +++++++ 5 files changed, 385 insertions(+) create mode 100644 library/src/include/definitions.h create mode 100644 library/src/include/logging.h create mode 100644 library/src/include/status.h create mode 100644 library/src/include/utility.h create mode 100644 library/src/status.cpp diff --git 
a/library/src/include/definitions.h b/library/src/include/definitions.h new file mode 100644 index 00000000..2299cf60 --- /dev/null +++ b/library/src/include/definitions.h @@ -0,0 +1,26 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + +#ifndef ROCSPARSE_DEFINITIONS_H_ +#define ROCSPARSE_DEFINITIONS_H_ + +#include "status.h" + +/******************************************************************************* + * Definitions + * this file to not include any others + * thereby it can include top-level definitions included by all + ******************************************************************************/ + +#define THROW_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ + { \ + hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if(TMP_STATUS_FOR_CHECK != hipSuccess) \ + { \ + throw get_rocsparse_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ + } \ + } + +#endif // ROCSPARSE_DEFINITIONS_H_ diff --git a/library/src/include/logging.h b/library/src/include/logging.h new file mode 100644 index 00000000..48afaacd --- /dev/null +++ b/library/src/include/logging.h @@ -0,0 +1,210 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#ifndef ROCSPARSE_LOGGING_H_ +#define ROCSPARSE_LOGGING_H_ + +#include +#include + +/** + * @brief Logging function + * + * @details + * open_log_stream Open stream log_os for logging. + * If the environment variable with name environment_variable_name + * is not set, then stream log_os to std::cerr. + * Else open a file at the full logfile path contained in + * the environment variable. + * If opening the file suceeds, stream to the file + * else stream to std::cerr. 
+ * + * @param[in] + * environment_variable_name std::string + * Name of environment variable that contains + * the full logfile path. + * + * @parm[out] + * log_os std::ostream** + * Output stream. Stream to std:err if environment_variable_name + * is not set, else set to stream to log_ofs + * + * @parm[out] + * log_ofs std::ofstream* + * Output file stream. If log_ofs->is_open()==true, then log_os + * will stream to log_ofs. Else it will stream to std::cerr. + */ + +inline void open_log_stream(std::ostream** log_os, + std::ofstream* log_ofs, + std::string environment_variable_name) +{ + *log_os = &std::cerr; + + char const* environment_variable_value = getenv(environment_variable_name.c_str()); + + if(environment_variable_value != NULL) + { + // if environment variable is set, open file at logfile_pathname contained in the + // environment variable + std::string logfile_pathname = (std::string)environment_variable_value; + log_ofs->open(logfile_pathname); + + // if log_ofs is open, then stream to log_ofs, else log_os is already + // set equal to std::cerr + if(log_ofs->is_open() == true) + { + *log_os = log_ofs; + } + } +} + +/** + * @brief Invoke functor for each argument in variadic parameter pack. + * @detail + * The variatic template function each_args applies the functor f + * to each argument in the expansion of the parameter pack xs... + + * Note that in ((void)f(xs),0) the C/C++ comma operator evaluates + * the first expression (void)f(xs) and discards the output, then + * it evaluates the second expression 0 and returns the output 0. + + * It thus calls (void)f(xs) on each parameter in xs... as a bye-product of + * building the initializer_list 0,0,0,...0. The initializer_list is discarded. + * + * @param f functor to apply to each argument + * + * @parm xs variadic parameter pack with list of arguments + */ +template +void each_args(F f, Ts&... 
xs) +{ + (void)std::initializer_list{((void)f(xs), 0)...}; +} + +/** + * @brief Workaround for gcc warnings when each_args called with single argument + * and no parameter pack. + */ +template +void each_args(F) +{ +} + +/** + * @brief Functor for logging arguments + * + * @details Functor to log single argument to ofs. + * The overloaded () in log_arg is the function call operator. + * The definition in log_arg says "objects of type log_arg can have + * the function call operator () applied to them with operand x, + * and it will output x to ofs and return void". + */ +struct log_arg +{ + log_arg(std::ostream& os, std::string& separator) : os_(os), separator_(separator) {} + + /// Generic overload for () operator. + template + void operator()(T& x) const + { + os_ << separator_ << x; + } +/* + /// Overload () operator for rocsparse_float_complex. + void operator()(const rocsparse_float_complex complex_value) const + { + os_ << separator_ << complex_value.x << separator_ << complex_value.y; + } + + /// Overload () operator for rocsparse_double_complex. + void operator()(const rocsparse_double_complex complex_value) const + { + os_ << separator_ << complex_value.x << separator_ << complex_value.y; + } +*/ + private: + std::ostream& os_; ///< Output stream. + std::string& separator_; ///< Separator: output preceding argument. +}; + +/** + * @brief Logging function + * + * @details + * log_arguments Log arguments to output file stream. Arguments + * are preceded by new line, and separated by separator. + * + * @param[in] + * ofs std::ofstream + * Open output stream file. + * + * @param[in] + * separator std::string + * Separator to print between arguments. + * + * @param[in] + * head + * First argument to log. It is preceded by newline. + * + * @param[in] + * xs + * Variadic parameter pack. Each argument in variadic + * parameter pack is logged, and it is preceded by + * separator. 
+ */ +template +void log_arguments(std::ostream& os, std::string& separator, H head, Ts&... xs) +{ + os << "\n" << head; + each_args(log_arg{os, separator}, xs...); +} + +/** + * @brief Logging function + * + * @details + * log_arguments Log argument to output file stream. Argument + * is preceded by new line. + * + * @param[in] + * ofs std::ofstream + * open output stream file. + * + * @param[in] + * separator std::string + * Not used. + * + * @param[in] + * head + * Argument to log. It is preceded by newline. + */ +template +void log_argument(std::ostream& os, std::string& separator, H head) +{ + os << "\n" << head; +} + +/** + * @brief Logging function + * + * @details + * log_arguments Log argument to output file stream. Argument + * is preceded by new line. + * + * @param[in] + * ofs std::ofstream + * open output stream file. + * + * @param[in] + * head + * Argument to log. It is preceded by newline. + */ +template +void log_argument(std::ostream& os, H head) +{ + os << "\n" << head; +} + +#endif // ROCSPARSE_LOGGING_H_ diff --git a/library/src/include/status.h b/library/src/include/status.h new file mode 100644 index 00000000..c10fdbdd --- /dev/null +++ b/library/src/include/status.h @@ -0,0 +1,18 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. 
+ * + * ************************************************************************ */ + +#ifndef ROCSPARSE_STATUS_H_ +#define ROCSPARSE_STATUS_H_ + +#include "rocsparse.h" + +#include + +/******************************************************************************* + * \brief convert hipError_t to rocblas_status + ******************************************************************************/ +rocsparseStatus_t get_rocsparse_status_for_hip_status(hipError_t status); + +#endif // ROCSPARSE_STATUS_H_ diff --git a/library/src/include/utility.h b/library/src/include/utility.h new file mode 100644 index 00000000..a4796009 --- /dev/null +++ b/library/src/include/utility.h @@ -0,0 +1,85 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#ifndef ROCSPARSE_UTILITY_H_ +#define ROCSPARSE_UTILITY_H_ + +#include "rocsparse.h" +#include "context.h" +#include "logging.h" + +#include +#include +#include + +// if trace logging is turned on with +// (handle->layer_mode & rocblas_layer_mode_log_trace) == true +// then +// log_function will call log_arguments to log function +// arguments with a comma separator +template +void log_trace(rocsparseHandle_t handle, H head, Ts&... xs) +{ + if(nullptr != handle) + { + if(handle->layer_mode & ROCSPARSE_LAYER_MODE_LOG_TRACE) + { + std::string comma_separator = ","; + + std::ostream* os = handle->log_trace_os; + log_arguments(*os, comma_separator, head, xs...); + } + } +} + +// if bench logging is turned on with +// (handle->layer_mode & rocblas_layer_mode_log_bench) == true +// then +// log_bench will call log_arguments to log a string that +// can be input to the executable rocblas-bench. +template +void log_bench(rocsparseHandle_t handle, H head, std::string precision, Ts&... 
xs) +{ + if(nullptr != handle) + { + if(handle->layer_mode & ROCSPARSE_LAYER_MODE_LOG_BENCH) + { + std::string space_separator = " "; + + std::ostream* os = handle->log_bench_os; + log_arguments(*os, space_separator, head, precision, xs...); + } + } +} + +// replaces X in string with s, d, c, z or h depending on typename T +template +std::string replaceX(std::string input_string) +{ + if(std::is_same::value) + { + std::replace(input_string.begin(), input_string.end(), 'X', 's'); + } + else if(std::is_same::value) + { + std::replace(input_string.begin(), input_string.end(), 'X', 'd'); + } +/* + else if(std::is_same::value) + { + std::replace(input_string.begin(), input_string.end(), 'X', 'c'); + } + else if(std::is_same::value) + { + std::replace(input_string.begin(), input_string.end(), 'X', 'z'); + } + else if(std::is_same::value) + { + std::replace(input_string.begin(), input_string.end(), 'X', 'h'); + } +*/ + return input_string; +} + +#endif // ROCSPARSE_UTILITY_H_ diff --git a/library/src/status.cpp b/library/src/status.cpp new file mode 100644 index 00000000..187dbcf9 --- /dev/null +++ b/library/src/status.cpp @@ -0,0 +1,46 @@ +/* ************************************************************************ + * Copyright 2016 Advanced Micro Devices, Inc. 
+ * + * ************************************************************************ */ + +#include "status.h" +#include "rocsparse.h" + +#include + +/******************************************************************************* + * \brief convert hipError_t to rocblas_status + * TODO - enumerate library calls to hip runtime, enumerate possible errors from those calls + ******************************************************************************/ +rocsparseStatus_t get_rocsparse_status_for_hip_status(hipError_t status) +{ + switch(status) + { + // success + case hipSuccess: + return ROCSPARSE_STATUS_SUCCESS; + + // internal hip memory allocation + case hipErrorMemoryAllocation: + case hipErrorLaunchOutOfResources: + return ROCSPARSE_STATUS_MEMORY_ERROR; + + // user-allocated hip memory + case hipErrorInvalidDevicePointer: // hip memory + return ROCSPARSE_STATUS_INVALID_POINTER; + + // user-allocated device, stream, event + case hipErrorInvalidDevice: + case hipErrorInvalidResourceHandle: + return ROCSPARSE_STATUS_INVALID_HANDLE; + + // library using hip incorrectly + case hipErrorInvalidValue: + return ROCSPARSE_STATUS_INTERNAL_ERROR; + + // hip runtime failing + case hipErrorNoDevice: // no hip devices + case hipErrorUnknown: + default: return ROCSPARSE_STATUS_INTERNAL_ERROR; + } +} From 24ac8fe790c29a604ca9034d0ffcc81aee0a2f2d Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 17 Apr 2018 13:59:07 +0200 Subject: [PATCH 011/304] added logging and csr-vector kernel --- library/src/CMakeLists.txt | 1 + library/src/context.cpp | 41 ++++- library/src/include/context.h | 2 + library/src/level2/csrmv_device.h | 199 ++++++++++++++++++++++ library/src/level2/rocsparse_csrmv.cpp | 224 ++++++++++++++++++++++++- 5 files changed, 463 insertions(+), 4 deletions(-) create mode 100644 library/src/level2/csrmv_device.h diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 217efc97..be5fa92d 100644 --- a/library/src/CMakeLists.txt +++ 
b/library/src/CMakeLists.txt @@ -6,6 +6,7 @@ set(rocsparse_source src/context.cpp src/matrix.cpp + src/status.cpp src/level2/rocsparse_csrmv.cpp diff --git a/library/src/context.cpp b/library/src/context.cpp index ce053969..3658db49 100644 --- a/library/src/context.cpp +++ b/library/src/context.cpp @@ -2,22 +2,58 @@ * Copyright 2018 Advanced Micro Devices, Inc. * ************************************************************************ */ +#include "definitions.h" #include "context.h" #include "rocsparse.h" - +#include "utility.h" rocsparseContext::rocsparseContext() { + // Default device is active device + THROW_IF_HIP_ERROR(hipGetDevice(&device)); + THROW_IF_HIP_ERROR(hipGetDeviceProperties(&properties, device)); // Default is system stream stream = 0; // Default pointer mode is host pointer_mode = ROCSPARSE_POINTER_MODE_HOST; + // Device warp size + warp_size = properties.warpSize; + // Layer mode + char *str_layer_mode; + if ((str_layer_mode = getenv("ROCSPARSE_LAYER")) == NULL) + { + layer_mode = ROCSPARSE_LAYER_MODE_NONE; + } + else + { + layer_mode = (rocsparseLayerMode_t) (atoi(str_layer_mode)); + } + + // Open log file + if (layer_mode & ROCSPARSE_LAYER_MODE_LOG_TRACE) + { + open_log_stream(&log_trace_os, &log_trace_ofs, "ROCSPARSE_LOG_TRACE_PATH"); + } + + // Open log_bench file + if (layer_mode & ROCSPARSE_LAYER_MODE_LOG_BENCH) + { + open_log_stream(&log_bench_os, &log_bench_ofs, "ROCSPARSE_LOG_BENCH_PATH"); + } } rocsparseContext::~rocsparseContext() { + if (log_trace_ofs.is_open()) + { + log_trace_ofs.close(); + } + if (log_bench_ofs.is_open()) + { + log_bench_ofs.close(); + } } @@ -35,6 +71,8 @@ extern "C" rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle) try { *handle = new rocsparseContext; + log_trace(*handle, "rocsparseCreate"); + } catch(rocsparseStatus_t status) { @@ -46,6 +84,7 @@ extern "C" rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle) extern "C" rocsparseStatus_t rocsparseDestroy(rocsparseHandle_t handle) { + 
log_trace(handle, "rocsparseDestroy"); // Destruct try { diff --git a/library/src/include/context.h b/library/src/include/context.h index 5eb5b877..28e029cd 100644 --- a/library/src/include/context.h +++ b/library/src/include/context.h @@ -23,6 +23,8 @@ struct rocsparseContext int device; // device properties hipDeviceProp_t properties; + // device warp size + int warp_size; // stream hipStream_t stream; // pointer mode diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h new file mode 100644 index 00000000..cc74820e --- /dev/null +++ b/library/src/level2/csrmv_device.h @@ -0,0 +1,199 @@ +#include + +// Knuth's Two-Sum algorithm, which allows us to add together two floating +// point numbers and exactly tranform the answer into a sum and a +// rounding error. +// Inputs: x and y, the two inputs to be aded together. +// In/Out: *sumk_err, which is incremented (by reference) -- holds the +// error value as a result of the 2sum calculation. +// Returns: The non-corrected sum of inputs x and y. +template +static __device__ +T two_sum(T x, T y, T *sumk_err) +{ + const T sumk_s = x + y; +#ifdef EXTENDED_PRECISION + // We use this 2Sum algorithm to perform a compensated summation, + // which can reduce the cummulative rounding errors in our SpMV summation. + // Our compensated sumation is based on the SumK algorithm (with K==2) from + // Ogita, Rump, and Oishi, "Accurate Sum and Dot Product" in + // SIAM J. on Scientific Computing 26(6) pp 1955-1988, Jun. 2005. + + // 2Sum can be done in 6 FLOPs without a branch. However, calculating + // double precision is slower than single precision on every existing GPU. + // As such, replacing 2Sum with Fast2Sum when using DPFP results in slightly + // better performance. This is especially true on non-workstation GPUs with + // low DPFP rates. Fast2Sum is faster even though we must ensure that + // |a| > |b|. Branch divergence is better than the DPFP slowdown. 
+ // Thus, for DPFP, our compensated summation algorithm is actually described + // by both Pichat and Neumaier in "Correction d'une somme en arithmetique + // a virgule flottante" (J. Numerische Mathematik 19(5) pp. 400-406, 1972) + // and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher + // Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, + // 1974), respectively. + if (fabs(x) < fabs(y)) + { + const T swap = x; + x = y; + y = swap; + } + (*sumk_err) += (y - (sumk_s - x)); + // Original 6 FLOP 2Sum algorithm. + //T bp = sumk_s - x; + //(*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); +#endif + return sumk_s; +} + +// Performs (x_vals * x_vec) + y using an FMA. +// Ideally, we would perform an error-free transformation here and return the +// appropriate error. However, the EFT of an FMA is very expensive. As such, +// if we are in EXTENDED_PRECISION mode, this function devolves into two_sum +// with x_vals and x_vec inputs multiplied separately from the compensated add. +template +static __device__ +T two_fma(T x_vals, T x_vec, T y, T *sumk_err) +{ +#ifdef EXTENDED_PRECISION + T x = x_vals * x_vec; + const T sumk_s = x + y; + if (fabs(x) < fabs(y)) + { + const T swap = x; + x = y; + y = swap; + } + (*sumk_err) += (y - (sumk_s - x)); + // 2Sum in the FMA case. Poor performance on low-DPFP GPUs. + //const T bp = fma(-x_vals, x_vec, sumk_s); + //(*sumk_err) += (fma(x_vals, x_vec, -(sumk_s - bp)) + (y - bp)); + return sumk_s; +#else + return fma(x_vals, x_vec, y); +#endif +} + +// A method of doing the final reduction without having to copy and paste +// it a bunch of times. +// The EXTENDED_PRECISION section is done as part of the PSum2 addition, +// where we take temporary sums and errors for multiple threads and combine +// them together using the same 2Sum method. 
+// Inputs: cur_sum: the input from which our sum starts +// err: the current running cascade error for this final summation +// partial: the local memory which holds the values to sum +// (we eventually use it to pass down temp. err vals as well) +// lid: local ID of the work item calling this function. +// thread_lane: The lane within this SUBWAVE for reduction. +// round: This parallel summation method operates in multiple rounds +// to do a parallel reduction. See the blow comment for usage. +template +static __device__ +T sum2_reduce(T cur_sum, T *err, + volatile T *partial, + int lid, + int thread_lane, + int round) +{ + if (SUBWAVE_SIZE > round) + { +#ifdef EXTENDED_PRECISION + const unsigned int partial_dest = lid + round; + if (thread_lane < round) + cur_sum = two_sum(cur_sum, partial[partial_dest], err); + // We reuse the LDS entries to move the error values down into lower + // threads. This saves LDS space, allowing higher occupancy, but requires + // more barriers, which can reduce performance. + __syncthreads(); + // Have all of those upper threads pass their temporary errors + // into a location that the lower threads can read. + if (thread_lane >= round) + partial[lid] = *err; + __syncthreads(); + if (thread_lane < round) { // Add those errors in. + *err += partial[partial_dest]; + partial[lid] = cur_sum; + } +#else + // This is the more traditional reduction algorithm. It is up to + // 25% faster (about 10% on average -- potentially worse on devices + // with low double-precision calculation rates), but can result in + // numerical inaccuracies, especially in single precision. 
+ cur_sum += partial[lid + round]; + __syncthreads(); + partial[lid] = cur_sum; +#endif + } + return cur_sum; +} + +// Uses macro constants: +// WAVE_SIZE - "warp size", typically 64 (AMD) or 32 (NV) +// WG_SIZE - workgroup ("block") size, 1D representation assumed +// int - typename for the type of integer data read by the kernel, usually unsigned int +// T - typename for the type of floating point data, usually double +// SUBWAVE_SIZE - the length of a "sub-wave", a power of 2, i.e. 1,2,4,...,WAVE_SIZE, assigned to process a single matrix row +template +static __device__ +//__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void csrmvn_general_device(int num_rows, + T alpha, + const int *row_offset, + const int *col, + const T *val, + const T *x, + T beta, + T *y) +{ + __shared__ volatile T sdata [WG_SIZE + SUBWAVE_SIZE / 2]; + + //const int vectors_per_block = WG_SIZE/SUBWAVE_SIZE; + const int global_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // global workitem id + const int local_id = hipThreadIdx_x; // local workitem id + const int thread_lane = local_id & (SUBWAVE_SIZE - 1); + const int vector_id = global_id / SUBWAVE_SIZE; // global vector id + //const int vector_lane = local_id / SUBWAVE_SIZE; // vector id within the workgroup + const int num_vectors = hipGridDim_x * hipBlockDim_x / SUBWAVE_SIZE; + + for(int row = vector_id; row < num_rows; row += num_vectors) + { + const int row_start = row_offset[row]; + const int row_end = row_offset[row+1]; + T sum = 0.; + + T sumk_e = 0.; + // It is about 5% faster to always multiply by alpha, rather than to + // check whether alpha is 0, 1, or other and do different code paths. + for(int j = row_start + thread_lane; j < row_end; j += SUBWAVE_SIZE) + sum = two_fma(alpha * val[j], x[col[j]], sum, &sumk_e); + T new_error = 0.; + sum = two_sum(sum, sumk_e, &new_error); + + // Parallel reduction in shared memory. 
+ sdata[local_id] = sum; + + // This compensated summation reduces cummulative rounding errors, + // which can become a problem on GPUs because our reduction order is + // different than what would be used on a CPU. + // It is based on the PSumK algorithm (with K==2) from + // Yamanaka, Ogita, Rump, and Oishi, "A Parallel Algorithm of + // Accurate Dot Product," in the Journal of Parallel Computing, + // 34(6-8), pp. 392-410, Jul. 2008. + #pragma unroll + for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + __syncthreads(); + sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, i); + } + + if (thread_lane == 0) + { + if (beta == 0) + y[row] = sum + new_error; + else + { + sum = two_fma(beta, y[row], sum, &new_error); + y[row] = sum + new_error; + } + } + } +} diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index 3563c936..506d7d1e 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -4,7 +4,29 @@ #include "rocsparse.h" #include "context.h" +#include "utility.h" #include "matrix.h" +#include "csrmv_device.h" + +#include + +template +__global__ +void csrmvn_kernel_host_pointer(int m, T alpha, const int *ptr, const int *col, + const T *val, const T *x, T beta, T *y) +{ + csrmvn_general_device( + m, alpha, ptr, col, val, x, beta, y); +} + +template +__global__ +void csrmvn_kernel_device_pointer(int m, const T *alpha, const int *ptr, const int *col, + const T *val, const T *x, const T *beta, T *y) +{ + csrmvn_general_device( + m, *alpha, ptr, col, val, x, *beta, y); +} template rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, @@ -31,10 +53,36 @@ rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, return ROCSPARSE_STATUS_NOT_INITIALIZED; } - // Logging + // Logging TODO bench logging if (handle->pointer_mode == ROCSPARSE_POINTER_MODE_HOST) { - // TODO + log_trace(handle, + replaceX("rocsparse_Xcsrmv"), + transA, + m, n, nnz, + *alpha, + (const void*&) 
descrA, + (const void*&) csrValA, + (const void*&) csrRowPtrA, + (const void*&) csrColIndA, + (const void*&) x, + *beta, + (const void*&) y); + } + else + { + log_trace(handle, + replaceX("rocsparse_Xcsrmv"), + transA, + m, n, nnz, + (const void*&) alpha, + (const void*&) descrA, + (const void*&) csrValA, + (const void*&) csrRowPtrA, + (const void*&) csrColIndA, + (const void*&) x, + (const void*&) beta, + (const void*&) y); } // Check matrix type @@ -106,7 +154,177 @@ rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, // Run different csrmv kernels if (transA == ROCSPARSE_OPERATION_NON_TRANSPOSE) { - // TODO +#define CSRMVN_DIM 512 + + int nnz_per_row = nnz / m; + + dim3 csrmvn_blocks((m-1)/CSRMVN_DIM+1); + dim3 csrmvn_threads(CSRMVN_DIM); + + if (handle->pointer_mode == ROCSPARSE_POINTER_MODE_DEVICE) + { + if (handle->warp_size == 32) + { + if (nnz_per_row < 4) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + } + else if (nnz_per_row < 8) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + } + else if (nnz_per_row < 16) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + } + else if (nnz_per_row < 32) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + } + else + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + } + } + else if (handle->warp_size == 64) + { + if (nnz_per_row < 4) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, alpha, csrRowPtrA, csrColIndA, csrValA, x, 
beta, y); + } + else if (nnz_per_row < 8) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + } + else if (nnz_per_row < 16) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + } + else if (nnz_per_row < 32) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + } + else if (nnz_per_row < 64) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + } + else + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + } + } + else + { + return ROCSPARSE_STATUS_ARCH_MISMATCH; + } + } + else + { + if (*alpha == 0.0 && *beta == 1.0) + { + return ROCSPARSE_STATUS_SUCCESS; + } + + if (handle->warp_size == 32) + { + if (nnz_per_row < 4) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + } + else if (nnz_per_row < 8) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + } + else if (nnz_per_row < 16) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + } + else if (nnz_per_row < 32) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + } + else + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, 
+ m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + } + } + else if (handle->warp_size == 64) + { + if (nnz_per_row < 4) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + } + else if (nnz_per_row < 8) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + } + else if (nnz_per_row < 16) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + } + else if (nnz_per_row < 32) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + } + else if (nnz_per_row < 64) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + } + else + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, csrmvn_threads, 0, stream, + m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + } + } + else + { + return ROCSPARSE_STATUS_ARCH_MISMATCH; + } + } +#undef CSRMVN_DIM } else { From 9b0540f098c1a617cc9318aaf0149a06f4a47b77 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 17 Apr 2018 13:59:42 +0200 Subject: [PATCH 012/304] csrmv test updated to allow tolerances --- test/test_rocsparse_csrmv.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_rocsparse_csrmv.cpp b/test/test_rocsparse_csrmv.cpp index ec5c8a32..f1181d58 100644 --- a/test/test_rocsparse_csrmv.cpp +++ b/test/test_rocsparse_csrmv.cpp @@ -78,7 +78,8 @@ TEST(Tests, rocsparseScsrmv) { sum += alpha * Aval[j] * x[Acol[j]]; } - ASSERT_NEAR(result[i], sum, 0.0); + float eps = std::max(fabs(sum)*1e-4f, 1e-7); + ASSERT_NEAR(result[i], sum, eps); } @@ 
-153,7 +154,8 @@ TEST(Tests, rocsparseDcsrmv) { sum += alpha * Aval[j] * x[Acol[j]]; } - ASSERT_NEAR(result[i], sum, 0.0); + double eps = std::max(fabs(sum) * 1e-8, 1e-15); + ASSERT_NEAR(result[i], sum, eps); } From 27e49207cd34cc39b23bf1a374bfca4c0aa86ce6 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 17 Apr 2018 16:38:46 +0200 Subject: [PATCH 013/304] bunch of test scenarios for csrmv --- CMakeLists.txt | 16 + test/test_rocsparse_csrmv.cpp | 1474 ++++++++++++++++++++++++++++++++- test/test_utils.h | 195 +++++ 3 files changed, 1684 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 504e8a75..e91d045a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,6 +104,22 @@ if(BUILD_TEST) ) endif() find_package(GTest REQUIRED) + # Download some test matrices + set(TEST_MATRICES + nos1 + nos2 + nos3 + nos4 + nos5 + nos6 + nos7 + ) + foreach(m ${TEST_MATRICES}) + file(DOWNLOAD ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/lanpro/${m}.mtx.gz + ${CMAKE_CURRENT_BINARY_DIR}/matrices/${m}.mtx.gz) + execute_process(COMMAND gzip -d -f ${m}.mtx.gz + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/matrices) + endforeach() endif() # Benchmark dependencies diff --git a/test/test_rocsparse_csrmv.cpp b/test/test_rocsparse_csrmv.cpp index f1181d58..5b58766b 100644 --- a/test/test_rocsparse_csrmv.cpp +++ b/test/test_rocsparse_csrmv.cpp @@ -78,7 +78,7 @@ TEST(Tests, rocsparseScsrmv) { sum += alpha * Aval[j] * x[Acol[j]]; } - float eps = std::max(fabs(sum)*1e-4f, 1e-7); + float eps = std::max(fabs(sum)*1e-4f, 1e-6); ASSERT_NEAR(result[i], sum, eps); } @@ -161,3 +161,1475 @@ TEST(Tests, rocsparseDcsrmv) ROCSPARSE_CHECK(rocsparseDestroy(handle)); } + +TEST(Tests, rocsparseScsrmv_nos1) +{ + rocsparseHandle_t handle; + ROCSPARSE_CHECK(rocsparseCreate(&handle)); + + int nrow; + int ncol; + int nnz; + + int *coo_row = NULL; + int *coo_col = NULL; + float *coo_val = NULL; + + // Read matrix from MTX file + 
ASSERT_EQ(readMatrixFromMTX("../matrices/nos1.mtx", + nrow, ncol, nnz, + &coo_row, &coo_col, &coo_val), 0); + ASSERT_EQ(nrow, 237); + + int *Aptr = NULL; + int *Acol = NULL; + float *Aval = NULL; + + // Convert matrix to CSR + coo_to_csr(nrow, ncol, nnz, coo_row, coo_col, coo_val, &Aptr, &Acol, &Aval); + + // Clean up COO structure + free(coo_row); + free(coo_col); + free(coo_val); + + // Sample some random data + srand(12345ULL); + + float alpha = (float) rand() / RAND_MAX; + float beta = (float) rand() / RAND_MAX; + + float *x = (float*) malloc(sizeof(float)*nrow); + float *y = (float*) malloc(sizeof(float)*nrow); + for (int i=0; i +#include +#include +#include template inline int gen2DLaplacianUS(int ndim, int **rowptr, int **col, T **val) @@ -66,4 +69,196 @@ inline int gen2DLaplacianUS(int ndim, int **rowptr, int **col, T **val) return n; } +template +inline int readMatrixFromMTX(const char *filename, + int &nrow, int &ncol, int &nnz, + int **row, int **col, T **val) +{ + FILE *f = fopen(filename, "r"); + if (!f) + { + return -1; + } + + char line[1024]; + + // Check for banner + if (!fgets(line, 1024, f)) + { + return -1; + } + + char banner[16]; + char array[16]; + char coord[16]; + char data[16]; + char type[16]; + + // Extract banner + if (sscanf(line, "%s %s %s %s %s", banner, array, coord, data, type) != 5) + { + return -1; + } + + // Convert to lower case + for (char *p=array; *p!='\0'; *p=tolower(*p), p++); + for (char *p=coord; *p!='\0'; *p=tolower(*p), p++); + for (char *p=data; *p!='\0'; *p=tolower(*p), p++); + for (char *p=type; *p!='\0'; *p=tolower(*p), p++); + + // Check banner + if (strncmp(line, "%%MatrixMarket", 14) != 0) + { + return -1; + } + + // Check array type + if (strcmp(array, "matrix") != 0) + { + return -1; + } + + // Check coord + if (strcmp(coord, "coordinate") != 0) + { + return -1; + } + + // Check data + if (strcmp(data, "real") != 0) + { + return -1; + } + + // Check type + if (strcmp(type, "general") != 0 && + strcmp(type, 
"symmetric") != 0) + { + return -1; + } + + // Symmetric flag + int symm = !strcmp(type, "symmetric"); + + // Skip comments + while(fgets(line, 1024, f)) + { + if (line[0] != '%') + { + break; + } + } + + // Read dimensions + int snnz; + + sscanf(line, "%d %d %d", &nrow, &ncol, &snnz); + nnz = symm ? (snnz - nrow) * 2 + nrow : snnz; + + *row = (int*) malloc(sizeof(int)*nnz); + *col = (int*) malloc(sizeof(int)*nnz); + *val = (T*) malloc(sizeof(T)*nnz); + + // Read entries + int idx = 0; + while(fgets(line, 1024, f)) + { + int irow; + int icol; + double dval; + + sscanf(line, "%d %d %lf", &irow, &icol, &dval); + + --irow; + --icol; + + (*row)[idx] = irow; + (*col)[idx] = icol; + (*val)[idx] = (T) dval; + + ++idx; + + if (symm && irow != icol) { + + (*row)[idx] = icol; + (*col)[idx] = irow; + (*val)[idx] = (T) dval; + + ++idx; + + } + + } + + fclose(f); + + return 0; +} + +template +inline void coo_to_csr(int nrow, int ncol, int nnz, + const int *src_row, const int *src_col, const T *src_val, + int **dst_ptr, int **dst_col, T **dst_val) +{ + *dst_ptr = (int*) malloc(sizeof(int)*(nrow+1)); + *dst_col = (int*) malloc(sizeof(int)*nnz); + *dst_val = (T*) malloc(sizeof(T)*nnz); + + memset(*dst_ptr, 0, sizeof(int)*(nrow+1)); + + // Compute nnz entries per row + for (int i=0; i Date: Wed, 18 Apr 2018 09:20:44 +0200 Subject: [PATCH 014/304] cmake fix: do not download already available matrices --- CMakeLists.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e91d045a..315bf894 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,10 +115,12 @@ if(BUILD_TEST) nos7 ) foreach(m ${TEST_MATRICES}) - file(DOWNLOAD ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/lanpro/${m}.mtx.gz - ${CMAKE_CURRENT_BINARY_DIR}/matrices/${m}.mtx.gz) - execute_process(COMMAND gzip -d -f ${m}.mtx.gz - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/matrices) + if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/matrices/${m}.mtx") + 
file(DOWNLOAD ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/lanpro/${m}.mtx.gz + ${CMAKE_CURRENT_BINARY_DIR}/matrices/${m}.mtx.gz) + execute_process(COMMAND gzip -d -f ${m}.mtx.gz + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/matrices) + endif() endforeach() endif() From d25d00e212f099c28df833b9aeecc0d43eda44d2 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 18 Apr 2018 09:23:11 +0200 Subject: [PATCH 015/304] only single util file for benchmarks and tests ; fixes in test/benchmarks ; new csrmv benchmark for matrix market input added --- benchmark/CMakeLists.txt | 5 + benchmark/benchmark_csrmv_mtx.cpp | 256 +++++++++++++++++++++++++++++ benchmark/benchmark_spmv.cpp | 8 +- benchmark/benchmark_utils.h | 69 -------- test/CMakeLists.txt | 1 + test/test_rocsparse_csrmv.cpp | 42 ++++- test/test_utils.h => utils/utils.h | 6 +- 7 files changed, 310 insertions(+), 77 deletions(-) create mode 100644 benchmark/benchmark_csrmv_mtx.cpp delete mode 100644 benchmark/benchmark_utils.h rename test/test_utils.h => utils/utils.h (98%) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 4e053f26..ad985896 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -6,6 +6,10 @@ function(add_rocsparse_benchmark BENCHMARK_SOURCE) get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE) add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE}) + target_include_directories(${BENCHMARK_TARGET} SYSTEM + PUBLIC + ${CMAKE_SOURCE_DIR}/utils + ) if(HIP_PLATFORM STREQUAL "hcc") target_link_libraries(${BENCHMARK_TARGET} PRIVATE @@ -28,3 +32,4 @@ endfunction() # Benchmarks add_rocsparse_benchmark(benchmark_spmv.cpp) +add_rocsparse_benchmark(benchmark_csrmv_mtx.cpp) diff --git a/benchmark/benchmark_csrmv_mtx.cpp b/benchmark/benchmark_csrmv_mtx.cpp new file mode 100644 index 00000000..a3568957 --- /dev/null +++ b/benchmark/benchmark_csrmv_mtx.cpp @@ -0,0 +1,256 @@ +/* ************************************************************************ + * 
Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "benchmark/benchmark.h" +#include "utils.h" + +#include +#include + +#define HIP_CHECK(stat) \ +{ \ + hipError_t err = stat; \ + if (err != hipSuccess) \ + { \ + fprintf(stderr, "HIP error: %d line: %d\n", err, __LINE__); \ + exit(stat); \ + } \ +} + +#define ROCSPARSE_CHECK(stat) \ +{ \ + rocsparseStatus_t err = stat; \ + if (err != ROCSPARSE_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "ROCSPARSE error: %d line: %d\n", err, __LINE__); \ + exit(stat); \ + } \ +} + +void csrmv(rocsparseHandle_t handle, rocsparseOperation_t trans, + int nrow, int ncol, int nnz, const float *alpha, + rocsparseMatDescr_t descrA, const float *csrValA, + const int *csrRowPtrA, const int *csrColIndA, + const float *x, const float *beta, float *y) +{ + ROCSPARSE_CHECK(rocsparseScsrmv(handle, trans, nrow, ncol, nnz, alpha, + descrA, csrValA, csrRowPtrA, csrColIndA, + x, beta, y)); +} + +void csrmv(rocsparseHandle_t handle, rocsparseOperation_t trans, + int nrow, int ncol, int nnz, const double *alpha, + rocsparseMatDescr_t descrA, const double *csrValA, + const int *csrRowPtrA, const int *csrColIndA, + const double *x, const double *beta, double *y) +{ + ROCSPARSE_CHECK(rocsparseDcsrmv(handle, trans, nrow, ncol, nnz, alpha, + descrA, csrValA, csrRowPtrA, csrColIndA, + x, beta, y)); +} + +template +void run_benchmark(benchmark::State &state, const hipStream_t stream, int batch, + rocsparseHandle_t handle, rocsparseOperation_t trans, + int nrow, int ncol, int nnz, rocsparseMatDescr_t descr, + const ValueType *alpha, const ValueType *csrValA, + const int *csrRowPtrA, const int *csrColIndA, + const ValueType *x, const ValueType *beta, ValueType *y) +{ + // Warm up + for (int i=0; i<10; ++i) + { + csrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, + nrow, ncol, nnz, alpha, descr, csrValA, + csrRowPtrA, csrColIndA, x, beta, y); + } + 
HIP_CHECK(hipDeviceSynchronize()); + + for (auto _:state) + { + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i=0; i >(end-start); + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations()*batch* + (sizeof(ValueType)*(2*nrow+nnz)+sizeof(int)*(nrow+1+nnz))); + state.SetItemsProcessed(state.iterations()*batch*2*nnz); +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) + { + fprintf(stderr, "%s [ ]\n", argv[0]); + return -1; + } + + int trials = 200; + int batch_size = 1; + + // Parse command line + if (argc > 2) + { + trials = atoi(argv[2]); + } + if (argc > 3) + { + batch_size = atoi(argv[3]); + } + + // rocSPARSE handle + rocsparseHandle_t handle; + ROCSPARSE_CHECK(rocsparseCreate(&handle)); + + benchmark::Initialize(&argc, argv); + + hipStream_t stream = 0; + hipDeviceProp_t devProp; + int device_id = 0; + + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + printf("[HIP] Device name: %s\n", devProp.name); + + // Read matrix from file + int nrow; + int ncol; + int nnz; + + int *coo_row = NULL; + int *coo_col = NULL; + double *coo_val = NULL; + + if (readMatrixFromMTX(argv[1], nrow, ncol, nnz, + &coo_row, &coo_col, &coo_val) != 0) + { + fprintf(stderr, "Cannot read MTX file %s\n", argv[1]); + return -1; + } + printf("[MTX] %d x %d matrix with %d nnz\n", nrow, ncol, nnz); + + // Convert to CSR (host) TODO + int *Aptr = NULL; + int *Acol = NULL; + float *Avalf = NULL; + double *Avald = NULL; + + coo_to_csr(nrow, ncol, nnz, coo_row, coo_col, coo_val, + &Aptr, &Acol, &Avald); + + Avalf = (float*) malloc(sizeof(float)*nnz); + for (int i=0; i benchmarks = + { + benchmark::RegisterBenchmark("rocsparseScsrmv", run_benchmark, + stream, batch_size, + handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, + nrow, nrow, nnz, descrA, &alphaf, dAvalf, + dAptr, dAcol, dxf, &betaf, dyf), + benchmark::RegisterBenchmark("rocsparseDcsrmv", run_benchmark, + stream, batch_size, + 
handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, + nrow, nrow, nnz, descrA, &alphad, dAvald, + dAptr, dAcol, dxd, &betad, dyd) + }; + + for (auto& b:benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + b->Iterations(trials); + } + + benchmark::RunSpecifiedBenchmarks(); + + // Clear up on device + HIP_CHECK(hipFree(dAptr)); + HIP_CHECK(hipFree(dAcol)); + HIP_CHECK(hipFree(dAvalf)); + HIP_CHECK(hipFree(dAvald)); + HIP_CHECK(hipFree(dxf)); + HIP_CHECK(hipFree(dxd)); + HIP_CHECK(hipFree(dyf)); + HIP_CHECK(hipFree(dyd)); + + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); + ROCSPARSE_CHECK(rocsparseDestroy(handle)); + + return 0; +} diff --git a/benchmark/benchmark_spmv.cpp b/benchmark/benchmark_spmv.cpp index edd0fc79..2f4a6ae1 100644 --- a/benchmark/benchmark_spmv.cpp +++ b/benchmark/benchmark_spmv.cpp @@ -3,7 +3,7 @@ * ************************************************************************ */ #include "benchmark/benchmark.h" -#include "benchmark_utils.h" +#include "utils.h" #include #include @@ -96,15 +96,15 @@ int main(int argc, char *argv[]) int batch_size = 1; // Parse command line - if (argc > 2) + if (argc > 1) { ndim = atoi(argv[1]); } - if (argc > 3) + if (argc > 2) { trials = atoi(argv[2]); } - if (argc > 4) + if (argc > 3) { batch_size = atoi(argv[3]); } diff --git a/benchmark/benchmark_utils.h b/benchmark/benchmark_utils.h deleted file mode 100644 index 76d2a04e..00000000 --- a/benchmark/benchmark_utils.h +++ /dev/null @@ -1,69 +0,0 @@ -/* ************************************************************************ - * Copyright 2018 Advanced Micro Devices, Inc. 
- * ************************************************************************ */ - -#ifndef ROCSPARSE_BENCHMARK_UTILS_H_ -#define ROCSPARSE_BENCHMARK_UTILS_H_ - -#include - -template -inline int gen2DLaplacianUS(int ndim, int **rowptr, int **col, T **val) -{ - - int n = ndim * ndim; - int nnz_mat = n * 5 - ndim * 4; - - *rowptr = (int*) malloc((n+1)*sizeof(int)); - *col = (int*) malloc(nnz_mat*sizeof(int)); - *val = (T*) malloc(nnz_mat*sizeof(T)); - - int nnz = 0; - - // Fill local arrays - for (int i=0; i #include @@ -82,7 +82,20 @@ TEST(Tests, rocsparseScsrmv) ASSERT_NEAR(result[i], sum, eps); } + free(Aptr); + free(Acol); + free(Aval); + free(x); + free(y); + free(result); + + HIP_CHECK(hipFree(dAptr)); + HIP_CHECK(hipFree(dAcol)); + HIP_CHECK(hipFree(dAval)); + HIP_CHECK(hipFree(dx)); + HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -158,7 +171,20 @@ TEST(Tests, rocsparseDcsrmv) ASSERT_NEAR(result[i], sum, eps); } + free(Aptr); + free(Acol); + free(Aval); + free(x); + free(y); + free(result); + + HIP_CHECK(hipFree(dAptr)); + HIP_CHECK(hipFree(dAcol)); + HIP_CHECK(hipFree(dAval)); + HIP_CHECK(hipFree(dx)); + HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -264,6 +290,7 @@ TEST(Tests, rocsparseScsrmv_nos1) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -369,6 +396,7 @@ TEST(Tests, rocsparseScsrmv_nos2) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -474,6 +502,7 @@ TEST(Tests, rocsparseScsrmv_nos3) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -579,6 +608,7 @@ TEST(Tests, rocsparseScsrmv_nos4) 
HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -684,6 +714,7 @@ TEST(Tests, rocsparseScsrmv_nos5) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -789,6 +820,7 @@ TEST(Tests, rocsparseScsrmv_nos6) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -894,6 +926,7 @@ TEST(Tests, rocsparseScsrmv_nos7) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -999,6 +1032,7 @@ TEST(Tests, rocsparseDcsrmv_nos1) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -1104,6 +1138,7 @@ TEST(Tests, rocsparseDcsrmv_nos2) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -1209,6 +1244,7 @@ TEST(Tests, rocsparseDcsrmv_nos3) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -1314,6 +1350,7 @@ TEST(Tests, rocsparseDcsrmv_nos4) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -1419,6 +1456,7 @@ TEST(Tests, rocsparseDcsrmv_nos5) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -1524,6 +1562,7 @@ TEST(Tests, rocsparseDcsrmv_nos6) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } @@ -1629,6 +1668,7 @@ TEST(Tests, 
rocsparseDcsrmv_nos7) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); + ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); ROCSPARSE_CHECK(rocsparseDestroy(handle)); } diff --git a/test/test_utils.h b/utils/utils.h similarity index 98% rename from test/test_utils.h rename to utils/utils.h index a88d0e99..92fd98ed 100644 --- a/test/test_utils.h +++ b/utils/utils.h @@ -2,8 +2,8 @@ * Copyright 2018 Advanced Micro Devices, Inc. * ************************************************************************ */ -#ifndef ROCSPARSE_TEST_UTILS_H_ -#define ROCSPARSE_TEST_UTILS_H_ +#ifndef ROCSPARSE_UTILS_H_ +#define ROCSPARSE_UTILS_H_ #include #include @@ -261,4 +261,4 @@ inline void coo_to_csr(int nrow, int ncol, int nnz, } } -#endif // ROCSPARSE_TEST_UTILS_H_ +#endif // ROCSPARSE_UTILS_H_ From 83b45f0a469e034c01b12acaef71931d38f70b0b Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 18 Apr 2018 10:08:18 +0200 Subject: [PATCH 016/304] pointer mode manipulation functionality --- library/include/rocsparse.h | 16 +++++ library/src/CMakeLists.txt | 1 + library/src/context.cpp | 42 ------------ library/src/level2/rocsparse_csrmv.cpp | 22 +++---- library/src/rocsparse_auxiliary.cpp | 89 ++++++++++++++++++++++++++ 5 files changed, 117 insertions(+), 53 deletions(-) create mode 100644 library/src/rocsparse_auxiliary.cpp diff --git a/library/include/rocsparse.h b/library/include/rocsparse.h index 5aa3be07..21cca21f 100644 --- a/library/include/rocsparse.h +++ b/library/include/rocsparse.h @@ -110,6 +110,22 @@ rocsparseStatus_t rocsparseCreateMatDescr(rocsparseMatDescr_t *descrA); ROCSPARSE_EXPORT rocsparseStatus_t rocsparseDestroyMatDescr(rocsparseMatDescr_t descrA); +/******************************************************************************** + * \brief Indicates whether the scalar value pointers are on the host or device. 
+ * Set pointer mode, can be host or device + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseSetPointerMode(rocsparseHandle_t handle, + rocsparsePointerMode_t mode); +/******************************************************************************** + * \brief Get pointer mode, can be host or device. + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, + rocsparsePointerMode_t *mode); + + + /* * =========================================================================== * level 1 SPARSE diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index be5fa92d..3cf082a1 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -9,5 +9,6 @@ set(rocsparse_source src/status.cpp + src/rocsparse_auxiliary.cpp src/level2/rocsparse_csrmv.cpp ) diff --git a/library/src/context.cpp b/library/src/context.cpp index 3658db49..95a48258 100644 --- a/library/src/context.cpp +++ b/library/src/context.cpp @@ -7,7 +7,6 @@ #include "rocsparse.h" #include "utility.h" - rocsparseContext::rocsparseContext() { // Default device is active device @@ -55,44 +54,3 @@ rocsparseContext::~rocsparseContext() log_bench_ofs.close(); } } - - - -extern "C" rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle) -{ - // Check if handle is valid - if (handle == nullptr) - { - return ROCSPARSE_STATUS_INVALID_POINTER; - } - else - { - // Allocate - try - { - *handle = new rocsparseContext; - log_trace(*handle, "rocsparseCreate"); - - } - catch(rocsparseStatus_t status) - { - return status; - } - return ROCSPARSE_STATUS_SUCCESS; - } -} - -extern "C" rocsparseStatus_t rocsparseDestroy(rocsparseHandle_t handle) -{ - log_trace(handle, "rocsparseDestroy"); - // Destruct - try - { - delete handle; - } - catch(rocsparseStatus_t status) - { - return status; - } - return 
ROCSPARSE_STATUS_SUCCESS; -} diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index 506d7d1e..80ab89cf 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -167,31 +167,31 @@ rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, { if (nnz_per_row < 4) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); } else if (nnz_per_row < 8) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); } else if (nnz_per_row < 16) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); } else if (nnz_per_row < 32) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); } else { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); } @@ -200,37 +200,37 @@ rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, { if (nnz_per_row < 4) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); } else if (nnz_per_row < 8) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, m, alpha, csrRowPtrA, csrColIndA, csrValA, x, 
beta, y); } else if (nnz_per_row < 16) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); } else if (nnz_per_row < 32) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); } else if (nnz_per_row < 64) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); } else { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); } diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp new file mode 100644 index 00000000..47656fc2 --- /dev/null +++ b/library/src/rocsparse_auxiliary.cpp @@ -0,0 +1,89 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "context.h" +#include "rocsparse.h" +#include "utility.h" + +/******************************************************************************** + * \brief rocsparseHandle_t is a structure holding the rocsparse library context. + * It must be initialized using rocsparseCreate() + * and the returned handle must be passed + * to all subsequent library function calls. + * It should be destroyed at the end using rocsparseDestroy(). 
+ *******************************************************************************/ +extern "C" rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle) +{ + // Check if handle is valid + if (handle == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + else + { + // Allocate + try + { + *handle = new rocsparseContext; + log_trace(*handle, "rocsparseCreate"); + + } + catch(rocsparseStatus_t status) + { + return status; + } + return ROCSPARSE_STATUS_SUCCESS; + } +} + +/******************************************************************************** + * \brief destroy handle + *******************************************************************************/ +extern "C" rocsparseStatus_t rocsparseDestroy(rocsparseHandle_t handle) +{ + log_trace(handle, "rocsparseDestroy"); + // Destruct + try + { + delete handle; + } + catch(rocsparseStatus_t status) + { + return status; + } + return ROCSPARSE_STATUS_SUCCESS; +} + +/******************************************************************************** + * \brief Indicates whether the scalar value pointers are on the host or device. + * Set pointer mode, can be host or device + *******************************************************************************/ +extern "C" rocsparseStatus_t rocsparseSetPointerMode(rocsparseHandle_t handle, + rocsparsePointerMode_t mode) +{ + // Check if handle is valid + if (handle == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + handle->pointer_mode = mode; + log_trace(handle, "rocsparseSetPointerMode", mode); + return ROCSPARSE_STATUS_SUCCESS; +} + +/******************************************************************************** + * \brief Get pointer mode, can be host or device. 
+ *******************************************************************************/ +extern "C" rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, + rocsparsePointerMode_t *mode) +{ + // Check if handle is valid + if (handle == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + *mode = handle->pointer_mode; + log_trace(handle, "rocsparseGetPointerMode", *mode); + return ROCSPARSE_STATUS_SUCCESS; +} From 9fbd48d74dae654e8b9a62e3e8a8c1cf6eb9d973 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 18 Apr 2018 10:20:50 +0200 Subject: [PATCH 017/304] added function to obtain library version number --- example/rocsparse_handle.cpp | 9 +++++++++ library/include/rocsparse.h | 9 ++++++++- library/src/rocsparse_auxiliary.cpp | 21 +++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/example/rocsparse_handle.cpp b/example/rocsparse_handle.cpp index caf9075b..c4e335ab 100644 --- a/example/rocsparse_handle.cpp +++ b/example/rocsparse_handle.cpp @@ -2,6 +2,7 @@ * Copyright 2018 Advanced Micro Devices, Inc. 
* ************************************************************************ */ +#include #include int main(int argc, char *argv[]) @@ -9,6 +10,14 @@ int main(int argc, char *argv[]) rocsparseHandle_t handle; rocsparseCreate(&handle); + int version; + rocsparseGetVersion(handle, &version); + + printf("rocSPARSE version %d.%d.%d\n", + version / 100000, + version / 100 % 1000, + version % 100); + rocsparseDestroy(handle); return 0; diff --git a/library/include/rocsparse.h b/library/include/rocsparse.h index 21cca21f..a79c782e 100644 --- a/library/include/rocsparse.h +++ b/library/include/rocsparse.h @@ -124,7 +124,14 @@ ROCSPARSE_EXPORT rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, rocsparsePointerMode_t *mode); - +/******************************************************************************** + * \brief Get rocSPARSE version + * version % 100 = patch level + * version / 100 % 1000 = minor version + * version / 100000 = major version + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseGetVersion(rocsparseHandle_t handle, int *version); /* * =========================================================================== diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index 47656fc2..f903b833 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -87,3 +87,24 @@ extern "C" rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, log_trace(handle, "rocsparseGetPointerMode", *mode); return ROCSPARSE_STATUS_SUCCESS; } + +/******************************************************************************** + * \brief Get rocSPARSE version + * version % 100 = patch level + * version / 100 % 1000 = minor version + * version / 100000 = major version + *******************************************************************************/ +extern "C" rocsparseStatus_t rocsparseGetVersion(rocsparseHandle_t 
handle, + int *version) +{ + // Check if handle is valid + if (handle == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + *version = ROCSPARSE_VERSION_MAJOR * 100000 + + ROCSPARSE_VERSION_MINOR * 100 + + ROCSPARSE_VERSION_PATCH; + log_trace(handle, "rocsparseGetVersion", *version); + return ROCSPARSE_STATUS_SUCCESS; +} From bc392fe5e2fc46965db693873663187cec3d39d4 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 18 Apr 2018 10:49:11 +0200 Subject: [PATCH 018/304] stream management --- library/include/rocsparse.h | 19 +++++++++++++++++ library/src/context.cpp | 13 ++++++++++++ library/src/include/context.h | 8 +++++++ library/src/rocsparse_auxiliary.cpp | 33 +++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+) diff --git a/library/include/rocsparse.h b/library/include/rocsparse.h index a79c782e..66f30967 100644 --- a/library/include/rocsparse.h +++ b/library/include/rocsparse.h @@ -14,6 +14,8 @@ #include "rocsparse_version.h" #include "rocsparse_export.h" +#include + #ifdef __cplusplus extern "C" { #endif @@ -124,6 +126,21 @@ ROCSPARSE_EXPORT rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, rocsparsePointerMode_t *mode); +/******************************************************************************** + *! \brief Set rocsparse stream used for all subsequent library function calls. + * If not set, all hip kernels will take the default NULL stream. + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseSetStream(rocsparseHandle_t handle, + hipStream_t streamId); + +/******************************************************************************** + *! \brief Get rocsparse stream used for all subsequent library function calls. 
+ *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseGetStream(rocsparseHandle_t handle, + hipStream_t *streamId); + /******************************************************************************** * \brief Get rocSPARSE version * version % 100 = patch level @@ -133,6 +150,8 @@ rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, ROCSPARSE_EXPORT rocsparseStatus_t rocsparseGetVersion(rocsparseHandle_t handle, int *version); + + /* * =========================================================================== * level 1 SPARSE diff --git a/library/src/context.cpp b/library/src/context.cpp index 95a48258..4c545a56 100644 --- a/library/src/context.cpp +++ b/library/src/context.cpp @@ -54,3 +54,16 @@ rocsparseContext::~rocsparseContext() log_bench_ofs.close(); } } + +rocsparseStatus_t rocsparseContext::setStream(hipStream_t streamId) +{ + // TODO check if stream is valid + stream = streamId; + return ROCSPARSE_STATUS_SUCCESS; +} + +rocsparseStatus_t rocsparseContext::getStream(hipStream_t *streamId) const +{ + *streamId = stream; + return ROCSPARSE_STATUS_SUCCESS; +} diff --git a/library/src/include/context.h b/library/src/include/context.h index 28e029cd..50c6d03f 100644 --- a/library/src/include/context.h +++ b/library/src/include/context.h @@ -16,9 +16,16 @@ ******************************************************************************/ struct rocsparseContext { + // Constructor rocsparseContext(); + // Destructor ~rocsparseContext(); + // Set stream + rocsparseStatus_t setStream(hipStream_t streamId); + // Get stream + rocsparseStatus_t getStream(hipStream_t *streamId) const; + // device id int device; // device properties @@ -32,6 +39,7 @@ struct rocsparseContext // logging mode rocsparseLayerMode_t layer_mode; + // logging streams std::ofstream log_trace_ofs; std::ofstream log_bench_ofs; std::ostream *log_trace_os; diff --git a/library/src/rocsparse_auxiliary.cpp 
b/library/src/rocsparse_auxiliary.cpp index f903b833..e6c090a1 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -6,6 +6,8 @@ #include "rocsparse.h" #include "utility.h" +#include + /******************************************************************************** * \brief rocsparseHandle_t is a structure holding the rocsparse library context. * It must be initialized using rocsparseCreate() @@ -88,6 +90,37 @@ extern "C" rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, return ROCSPARSE_STATUS_SUCCESS; } +/******************************************************************************** + *! \brief Set rocsparse stream used for all subsequent library function calls. + * If not set, all hip kernels will take the default NULL stream. + *******************************************************************************/ +extern "C" rocsparseStatus_t rocsparseSetStream(rocsparseHandle_t handle, + hipStream_t streamId) +{ + // Check if handle is valid + if (handle == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + log_trace(handle, "rocsparseSetStream", streamId); + return handle->setStream(streamId); +} + +/******************************************************************************** + *! \brief Get rocsparse stream used for all subsequent library function calls. 
+ *******************************************************************************/ +extern "C" rocsparseStatus_t rocsparseGetStream(rocsparseHandle_t handle, + hipStream_t *streamId) +{ + // Check if handle is valid + if (handle == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + log_trace(handle, "rocsparseGetStream", *streamId); + return handle->getStream(streamId); +} + /******************************************************************************** * \brief Get rocSPARSE version * version % 100 = patch level From 0d974c92e6671be92c83babe3fbda36cecdcabd8 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 18 Apr 2018 11:18:22 +0200 Subject: [PATCH 019/304] matrix descriptor functionality --- library/include/rocsparse.h | 26 ++++++++ library/src/context.cpp | 5 +- library/src/include/context.h | 8 +-- library/src/include/matrix.h | 15 ++--- library/src/matrix.cpp | 99 +++++++++++++++++++++++++---- library/src/rocsparse_auxiliary.cpp | 30 +++++---- 6 files changed, 141 insertions(+), 42 deletions(-) diff --git a/library/include/rocsparse.h b/library/include/rocsparse.h index 66f30967..c69bee31 100644 --- a/library/include/rocsparse.h +++ b/library/include/rocsparse.h @@ -112,6 +112,32 @@ rocsparseStatus_t rocsparseCreateMatDescr(rocsparseMatDescr_t *descrA); ROCSPARSE_EXPORT rocsparseStatus_t rocsparseDestroyMatDescr(rocsparseMatDescr_t descrA); +/******************************************************************************** + * \brief Set the index base of the matrix descriptor. + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseSetMatIndexBase(rocsparseMatDescr_t descrA, + rocsparseIndexBase_t base); + +/******************************************************************************** + * \brief Returns the index base of the matrix descriptor. 
+ *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseIndexBase_t rocsparseGetMatIndexBase(const rocsparseMatDescr_t descrA); + +/******************************************************************************** + * \brief Set the matrix type of the matrix descriptor. + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseStatus_t rocsparseSetMatType(rocsparseMatDescr_t descrA, + rocsparseMatrixType_t type); + +/******************************************************************************** + * \brief Returns the matrix type of the matrix descriptor. + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparseMatrixType_t rocsparseGetMatType(const rocsparseMatDescr_t descrA); + /******************************************************************************** * \brief Indicates whether the scalar value pointers are on the host or device. 
* Set pointer mode, can be host or device diff --git a/library/src/context.cpp b/library/src/context.cpp index 4c545a56..e65be1e6 100644 --- a/library/src/context.cpp +++ b/library/src/context.cpp @@ -12,10 +12,7 @@ rocsparseContext::rocsparseContext() // Default device is active device THROW_IF_HIP_ERROR(hipGetDevice(&device)); THROW_IF_HIP_ERROR(hipGetDeviceProperties(&properties, device)); - // Default is system stream - stream = 0; - // Default pointer mode is host - pointer_mode = ROCSPARSE_POINTER_MODE_HOST; + // Device warp size warp_size = properties.warpSize; diff --git a/library/src/include/context.h b/library/src/include/context.h index 50c6d03f..49c68e45 100644 --- a/library/src/include/context.h +++ b/library/src/include/context.h @@ -32,10 +32,10 @@ struct rocsparseContext hipDeviceProp_t properties; // device warp size int warp_size; - // stream - hipStream_t stream; - // pointer mode - rocsparsePointerMode_t pointer_mode; + // stream ; default stream is system stream NULL + hipStream_t stream = 0; + // pointer mode ; default mode is host + rocsparsePointerMode_t pointer_mode = ROCSPARSE_POINTER_MODE_HOST; // logging mode rocsparseLayerMode_t layer_mode; diff --git a/library/src/include/matrix.h b/library/src/include/matrix.h index 392db518..7b75abd1 100644 --- a/library/src/include/matrix.h +++ b/library/src/include/matrix.h @@ -9,15 +9,14 @@ struct rocsparseMatDescr { - // Constructor - rocsparseMatDescr(); - // Destructor - ~rocsparseMatDescr(); - - // Matrix index base - rocsparseIndexBase_t base; // Matrix type - rocsparseMatrixType_t type; + rocsparseMatrixType_t type = ROCSPARSE_MATRIX_TYPE_GENERAL; + // Fill mode TODO +// rocsparseFillMode_t fill; + // Diagonal type +// rocsparseDiagType_t diag; + // Index base + rocsparseIndexBase_t base = ROCSPARSE_INDEX_BASE_ZERO; }; #endif // ROCSPARSE_MATRIX_H_ diff --git a/library/src/matrix.cpp b/library/src/matrix.cpp index 93d90963..a7e9cce2 100644 --- a/library/src/matrix.cpp +++ 
b/library/src/matrix.cpp @@ -7,19 +7,15 @@ -rocsparseMatDescr::rocsparseMatDescr() -{ - base = ROCSPARSE_INDEX_BASE_ZERO; - type = ROCSPARSE_MATRIX_TYPE_GENERAL; -} - -rocsparseMatDescr::~rocsparseMatDescr() -{ -} - - - -extern "C" rocsparseStatus_t rocsparseCreateMatDescr(rocsparseMatDescr_t *descrA) +/******************************************************************************** + * \brief rocsparseCreateMatDescr_t is a structure holding the rocsparse matrix + * descriptor. It must be initialized using rocsparseCreateMatDescr() + * and the retured handle must be passed to all subsequent library function + * calls that involve the matrix. + * It should be destroyed at the end using rocsparseDestroyMatDescr(). + *******************************************************************************/ +extern "C" +rocsparseStatus_t rocsparseCreateMatDescr(rocsparseMatDescr_t *descrA) { if (descrA == nullptr) { @@ -40,7 +36,11 @@ extern "C" rocsparseStatus_t rocsparseCreateMatDescr(rocsparseMatDescr_t *descrA } } -extern "C" rocsparseStatus_t rocsparseDestroyMatDescr(rocsparseMatDescr_t descrA) +/******************************************************************************** + * \brief destroy matrix descriptor + *******************************************************************************/ +extern "C" +rocsparseStatus_t rocsparseDestroyMatDescr(rocsparseMatDescr_t descrA) { // Destruct try @@ -53,3 +53,74 @@ extern "C" rocsparseStatus_t rocsparseDestroyMatDescr(rocsparseMatDescr_t descrA } return ROCSPARSE_STATUS_SUCCESS; } + +/******************************************************************************** + * \brief Set the index base of the matrix descriptor. 
+ *******************************************************************************/ +extern "C" +rocsparseStatus_t rocsparseSetMatIndexBase(rocsparseMatDescr_t descrA, + rocsparseIndexBase_t base) +{ + // Check if descriptor is valid + if (descrA == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + if (base != ROCSPARSE_INDEX_BASE_ZERO && + base != ROCSPARSE_INDEX_BASE_ONE) + { + return ROCSPARSE_STATUS_INVALID_VALUE; + } + descrA->base = base; + return ROCSPARSE_STATUS_SUCCESS; +} + +/******************************************************************************** + * \brief Returns the index base of the matrix descriptor. + *******************************************************************************/ +extern "C" +rocsparseIndexBase_t rocsparseGetMatIndexBase(const rocsparseMatDescr_t descrA) +{ + // If descriptor is invalid, default index base is returned + if (descrA == nullptr) + { + return ROCSPARSE_INDEX_BASE_ZERO; + } + return descrA->base; +} + +/******************************************************************************** + * \brief Set the matrix type of the matrix descriptor. + *******************************************************************************/ +extern "C" +rocsparseStatus_t rocsparseSetMatType(rocsparseMatDescr_t descrA, + rocsparseMatrixType_t type) +{ + // Check if descriptor is valid + if (descrA == nullptr) + { + return ROCSPARSE_STATUS_INVALID_POINTER; + } + if (type != ROCSPARSE_MATRIX_TYPE_GENERAL && + type != ROCSPARSE_MATRIX_TYPE_SYMMETRIC && + type != ROCSPARSE_MATRIX_TYPE_HERMITIAN) + { + return ROCSPARSE_STATUS_INVALID_VALUE; + } + descrA->type = type; + return ROCSPARSE_STATUS_SUCCESS; +} + +/******************************************************************************** + * \brief Returns the matrix type of the matrix descriptor. 
+ *******************************************************************************/ +extern "C" +rocsparseMatrixType_t rocsparseGetMatType(const rocsparseMatDescr_t descrA) +{ + // If descriptor is invalid, default matrix type is returned + if (descrA == nullptr) + { + return ROCSPARSE_MATRIX_TYPE_GENERAL; + } + return descrA->type; +} diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index e6c090a1..b317cbc7 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -15,7 +15,8 @@ * to all subsequent library function calls. * It should be destroyed at the end using rocsparseDestroy(). *******************************************************************************/ -extern "C" rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle) +extern "C" +rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle) { // Check if handle is valid if (handle == nullptr) @@ -42,7 +43,8 @@ extern "C" rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle) /******************************************************************************** * \brief destroy handle *******************************************************************************/ -extern "C" rocsparseStatus_t rocsparseDestroy(rocsparseHandle_t handle) +extern "C" +rocsparseStatus_t rocsparseDestroy(rocsparseHandle_t handle) { log_trace(handle, "rocsparseDestroy"); // Destruct @@ -61,8 +63,9 @@ extern "C" rocsparseStatus_t rocsparseDestroy(rocsparseHandle_t handle) * \brief Indicates whether the scalar value pointers are on the host or device. 
* Set pointer mode, can be host or device *******************************************************************************/ -extern "C" rocsparseStatus_t rocsparseSetPointerMode(rocsparseHandle_t handle, - rocsparsePointerMode_t mode) +extern "C" +rocsparseStatus_t rocsparseSetPointerMode(rocsparseHandle_t handle, + rocsparsePointerMode_t mode) { // Check if handle is valid if (handle == nullptr) @@ -77,8 +80,9 @@ extern "C" rocsparseStatus_t rocsparseSetPointerMode(rocsparseHandle_t handle, /******************************************************************************** * \brief Get pointer mode, can be host or device. *******************************************************************************/ -extern "C" rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, - rocsparsePointerMode_t *mode) +extern "C" +rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, + rocsparsePointerMode_t *mode) { // Check if handle is valid if (handle == nullptr) @@ -94,8 +98,9 @@ extern "C" rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, *! \brief Set rocsparse stream used for all subsequent library function calls. * If not set, all hip kernels will take the default NULL stream. *******************************************************************************/ -extern "C" rocsparseStatus_t rocsparseSetStream(rocsparseHandle_t handle, - hipStream_t streamId) +extern "C" +rocsparseStatus_t rocsparseSetStream(rocsparseHandle_t handle, + hipStream_t streamId) { // Check if handle is valid if (handle == nullptr) @@ -109,8 +114,9 @@ extern "C" rocsparseStatus_t rocsparseSetStream(rocsparseHandle_t handle, /******************************************************************************** *! \brief Get rocsparse stream used for all subsequent library function calls. 
*******************************************************************************/ -extern "C" rocsparseStatus_t rocsparseGetStream(rocsparseHandle_t handle, - hipStream_t *streamId) +extern "C" +rocsparseStatus_t rocsparseGetStream(rocsparseHandle_t handle, + hipStream_t *streamId) { // Check if handle is valid if (handle == nullptr) @@ -127,8 +133,8 @@ extern "C" rocsparseStatus_t rocsparseGetStream(rocsparseHandle_t handle, * version / 100 % 1000 = minor version * version / 100000 = major version *******************************************************************************/ -extern "C" rocsparseStatus_t rocsparseGetVersion(rocsparseHandle_t handle, - int *version) +extern "C" +rocsparseStatus_t rocsparseGetVersion(rocsparseHandle_t handle, int *version) { // Check if handle is valid if (handle == nullptr) From 87d5b628c4bc943e3b191b71c60cb4df3c667e8a Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 18 Apr 2018 14:58:39 +0200 Subject: [PATCH 020/304] hip error handling --- library/src/include/definitions.h | 35 ++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/library/src/include/definitions.h b/library/src/include/definitions.h index 2299cf60..1f8cd5e6 100644 --- a/library/src/include/definitions.h +++ b/library/src/include/definitions.h @@ -14,13 +14,34 @@ * thereby it can include top-level definitions included by all ******************************************************************************/ -#define THROW_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ - { \ - hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if(TMP_STATUS_FOR_CHECK != hipSuccess) \ - { \ - throw get_rocsparse_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ - } \ +#define RETURN_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ + { \ + hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if(TMP_STATUS_FOR_CHECK != hipSuccess) \ + { \ + return get_rocsparse_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ + } \ + } +#define 
THROW_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ + { \ + hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if(TMP_STATUS_FOR_CHECK != hipSuccess) \ + { \ + throw get_rocsparse_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ + } \ + } + +#define PRINT_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ + { \ + hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if(TMP_STATUS_FOR_CHECK != hipSuccess) \ + { \ + fprintf(stderr, \ + "hip error code: %d at %s:%d\n", \ + TMP_STATUS_FOR_CHECK, \ + __FILE__, \ + __LINE__); \ + } \ } #endif // ROCSPARSE_DEFINITIONS_H_ From 0f03743189f617cc6ae8b84f839bc0b0167f133f Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 19 Apr 2018 08:43:48 +0200 Subject: [PATCH 021/304] bugfix rocsparse_version.h installed to wrong dir --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 315bf894..a5dd545a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,8 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOLEAN "Add paths to linker search and installed rpath") + # CXX Build flags set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) From c870baf2c34d81e8416c0bdd42e9c3b689c093e5 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 2 May 2018 19:30:48 +0200 Subject: [PATCH 022/304] updated to point 2 option 1 (see issue tracker) --- CMakeLists.txt | 152 +-------- benchmark/benchmark_csrmv_mtx.cpp | 52 +-- benchmark/benchmark_spmv.cpp | 52 +-- cmake/Dependencies.cmake | 131 +++++++ cmake/SetToolchain.cmake | 47 +++ cmake/Verbose.cmake | 24 ++ example/rocsparse_handle.cpp | 8 +- library/CMakeLists.txt | 107 +++--- library/include/rocsparse-auxiliary.h | 120 +++++++ library/include/rocsparse-functions.h | 191 +++++++++++ library/include/rocsparse-types.h | 87 +++++ ...se_version.h.in => 
rocsparse-version.h.in} | 13 +- library/include/rocsparse.h | 275 +-------------- library/src/CMakeLists.txt | 6 +- library/src/context.cpp | 66 ---- library/src/handle.cpp | 86 +++++ library/src/include/context.h | 49 --- library/src/include/definitions.h | 8 +- library/src/include/handle.h | 73 ++++ library/src/include/logging.h | 9 +- library/src/include/matrix.h | 22 -- library/src/include/status.h | 12 +- library/src/include/utility.h | 31 +- library/src/level1/rocsparse_axpyi.cpp | 207 +++++++++++ library/src/level2/rocsparse_csrmv.cpp | 211 +++++++----- library/src/matrix.cpp | 126 ------- library/src/rocsparse_auxiliary.cpp | 194 ++++++++--- library/src/status.cpp | 18 +- test/test_rocsparse_csrmv.cpp | 322 +++++++++--------- test/test_rocsparse_handle.cpp | 8 +- 30 files changed, 1597 insertions(+), 1110 deletions(-) create mode 100644 cmake/Dependencies.cmake create mode 100644 cmake/SetToolchain.cmake create mode 100644 cmake/Verbose.cmake create mode 100644 library/include/rocsparse-auxiliary.h create mode 100644 library/include/rocsparse-functions.h create mode 100644 library/include/rocsparse-types.h rename library/include/{rocsparse_version.h.in => rocsparse-version.h.in} (61%) delete mode 100644 library/src/context.cpp create mode 100644 library/src/handle.cpp delete mode 100644 library/src/include/context.h create mode 100644 library/src/include/handle.h delete mode 100644 library/src/include/matrix.h create mode 100644 library/src/level1/rocsparse_axpyi.cpp delete mode 100644 library/src/matrix.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index a5dd545a..f34871e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,48 +6,24 @@ cmake_minimum_required(VERSION 3.5 FATAL_ERROR) # Consider removing this in the future # This should appear before the project command, because it does not use FORCE -if( WIN32 ) - set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) -else( ) - 
set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) -endif( ) +if(WIN32) + set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories") +else() + set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories") +endif() # CMake modules list(APPEND CMAKE_MODULE_PATH - ${CMAKE_CURRENT_SOURCE_DIR}/cmake - /opt/rocm/hip/cmake + ${CMAKE_CURRENT_SOURCE_DIR}/cmake + /opt/rocm/hip/cmake ) -# Find HIP package -find_package(HIP REQUIRED) - -# Set compiler -if(HIP_PLATFORM STREQUAL "nvcc") - message("-- PLATFORM = nvcc currently unsupported") -elseif(HIP_PLATFORM STREQUAL "hcc") - message("-- PLATFORM = hcc") - find_program(HIP_HCC_EXECUTABLE NAMES hcc PATHS - "${HIP_ROOT_DIR}" - ENV ROCM_PATH - ENV HIP_PATH - /opt/rocm - /opt/rocm/hip - PATH_SUFFIXES bin - NO_DEFAULT_PATH - ) - if(NOT HIP_HCC_EXECUTABLE) - find_program(HIP_HCC_EXECUTABLE hcc) - endif() - mark_as_advanced(HIP_HCC_EXECUTABLE) - set(CMAKE_CXX_COMPILER ${HIP_HCC_EXECUTABLE}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument") - list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hcc /opt/rocm/hip) -else() - message(FATAL_ERROR "HIP_PLATFORM must be 'hcc/nvcc' (AMD ROCm platform).") -endif() +# Set toolchain +include(cmake/SetToolchain.cmake) # rocSPARSE project project(rocsparse VERSION 0.1.0.0 LANGUAGES CXX) +set(rocsparse_SOVERSION 0) # Set a default build type if none was specified if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) @@ -58,120 +34,20 @@ endif() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOLEAN "Add paths to linker search and installed rpath") -# CXX Build flags +# Build flags set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") - -# HIP configuration -if(HIP_PLATFORM STREQUAL "hcc") - # Workaround until hcc & hip 
cmake modules fixes symlink logic in their config files. - # (Thanks to rocBLAS devs for finding workaround for this problem!) - list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hcc /opt/rocm/hip) - # Ignore hcc warning: argument unused during compilation: '-isystem /opt/rocm/hip/include' - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument") - find_package(hcc REQUIRED CONFIG PATHS /opt/rocm) - find_package(hip REQUIRED CONFIG PATHS /opt/rocm) -endif() # Build options option(BUILD_SHARED_LIBS "Build rocSPARSE as a shared library" ON) -option(BUILD_VERBOSE "Output additional build information" OFF) option(BUILD_TEST "Build tests (requires googletest)" OFF) option(BUILD_BENCHMARK "Build benchmarks (requires googlebenchmark)" OFF) option(BUILD_EXAMPLE "Build examples" ON) +option(BUILD_VERBOSE "Output additional build information" OFF) -# Test dependencies -if(BUILD_TEST) - if(NOT DEPENDENCIES_FORCE_DOWNLOAD) - find_package(GTest QUIET) - endif() - if(NOT GTEST_FOUND) - message(STATUS "GTest not found. 
Downloading and building GTest.") - include(cmake/DownloadProject.cmake) - find_package(Git REQUIRED) - set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gtest CACHE PATH "") - download_project(PROJ googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG master - INSTALL_DIR ${GTEST_ROOT} - CMAKE_ARGS -DBUILD_GTEST=ON -DINSTALL_GTEST=ON -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX= - LOG_DOWNLOAD TRUE - LOG_CONFIGURE TRUE - LOG_BUILD TRUE - LOG_INSTALL TRUE - BUILD_PROJECT TRUE - UPDATE_DISCONNECTED TRUE - ) - endif() - find_package(GTest REQUIRED) - # Download some test matrices - set(TEST_MATRICES - nos1 - nos2 - nos3 - nos4 - nos5 - nos6 - nos7 - ) - foreach(m ${TEST_MATRICES}) - if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/matrices/${m}.mtx") - file(DOWNLOAD ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/lanpro/${m}.mtx.gz - ${CMAKE_CURRENT_BINARY_DIR}/matrices/${m}.mtx.gz) - execute_process(COMMAND gzip -d -f ${m}.mtx.gz - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/matrices) - endif() - endforeach() -endif() - -# Benchmark dependencies -if(BUILD_BENCHMARK) - if(NOT DEPENDENCIES_FORCE_DOWNLOAD) - find_package(benchmark QUIET) - endif() - if(NOT benchmark_FOUND) - message(STATUS "Google Benchmark not found. 
Downloading and building Google Benchmark.") - include(cmake/DownloadProject.cmake) - find_package(Git REQUIRED) - set(GOOGLEBENCHMARK_ROOT ${CMAKE_CURRENT_BINARY_DIR}/googlebenchmark CACHE PATH "") - download_project(PROJ googlebenchmark - GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG master - INSTALL_DIR ${GOOGLEBENCHMARK_ROOT} - CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBENCHMARK_ENABLE_TESTING=OFF -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX= - LOG_DOWNLOAD TRUE - LOG_CONFIGURE TRUE - LOG_BUILD TRUE - LOG_INSTALL TRUE - BUILD_PROJECT TRUE - UPDATE_DISCONNECTED TRUE - ) - endif() - find_package(benchmark REQUIRED CONFIG PATHS ${GOOGLEBENCHMARK_ROOT}) -endif() - -# ROCm cmake project -find_package(ROCM QUIET CONFIG PATHS /opt/rocm) -if(NOT ROCM_FOUND) - set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download") - file( - DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip - ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}.zip - ) - execute_process( - COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}.zip - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - ) - find_package(ROCM REQUIRED CONFIG PATHS ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}) -endif() - -include(ROCMSetupVersion) -include(ROCMCreatePackage) -include(ROCMInstallTargets) -include(ROCMPackageConfigHelpers) -include(ROCMInstallSymlinks) +# Dependencies +include(cmake/Dependencies.cmake) # AMD targets set(AMDGPU_TARGETS gfx803;gfx900 CACHE STRING "List of specific machine types for library to target") diff --git a/benchmark/benchmark_csrmv_mtx.cpp b/benchmark/benchmark_csrmv_mtx.cpp index a3568957..068f3446 100644 --- a/benchmark/benchmark_csrmv_mtx.cpp +++ b/benchmark/benchmark_csrmv_mtx.cpp @@ -20,40 +20,40 @@ #define ROCSPARSE_CHECK(stat) \ { \ - rocsparseStatus_t err = stat; \ - if (err != ROCSPARSE_STATUS_SUCCESS) \ + rocsparse_status err = stat; \ + if (err != 
rocsparse_status_success) \ { \ fprintf(stderr, "ROCSPARSE error: %d line: %d\n", err, __LINE__); \ exit(stat); \ } \ } -void csrmv(rocsparseHandle_t handle, rocsparseOperation_t trans, +void csrmv(rocsparse_handle handle, rocsparse_operation trans, int nrow, int ncol, int nnz, const float *alpha, - rocsparseMatDescr_t descrA, const float *csrValA, + rocsparse_mat_descr descrA, const float *csrValA, const int *csrRowPtrA, const int *csrColIndA, const float *x, const float *beta, float *y) { - ROCSPARSE_CHECK(rocsparseScsrmv(handle, trans, nrow, ncol, nnz, alpha, - descrA, csrValA, csrRowPtrA, csrColIndA, - x, beta, y)); + ROCSPARSE_CHECK(rocsparse_scsrmv(handle, trans, nrow, ncol, nnz, alpha, + descrA, csrValA, csrRowPtrA, csrColIndA, + x, beta, y)); } -void csrmv(rocsparseHandle_t handle, rocsparseOperation_t trans, +void csrmv(rocsparse_handle handle, rocsparse_operation trans, int nrow, int ncol, int nnz, const double *alpha, - rocsparseMatDescr_t descrA, const double *csrValA, + rocsparse_mat_descr descrA, const double *csrValA, const int *csrRowPtrA, const int *csrColIndA, const double *x, const double *beta, double *y) { - ROCSPARSE_CHECK(rocsparseDcsrmv(handle, trans, nrow, ncol, nnz, alpha, - descrA, csrValA, csrRowPtrA, csrColIndA, - x, beta, y)); + ROCSPARSE_CHECK(rocsparse_dcsrmv(handle, trans, nrow, ncol, nnz, alpha, + descrA, csrValA, csrRowPtrA, csrColIndA, + x, beta, y)); } template void run_benchmark(benchmark::State &state, const hipStream_t stream, int batch, - rocsparseHandle_t handle, rocsparseOperation_t trans, - int nrow, int ncol, int nnz, rocsparseMatDescr_t descr, + rocsparse_handle handle, rocsparse_operation trans, + int nrow, int ncol, int nnz, rocsparse_mat_descr descr, const ValueType *alpha, const ValueType *csrValA, const int *csrRowPtrA, const int *csrColIndA, const ValueType *x, const ValueType *beta, ValueType *y) @@ -61,7 +61,7 @@ void run_benchmark(benchmark::State &state, const hipStream_t stream, int batch, // Warm up for (int 
i=0; i<10; ++i) { - csrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, + csrmv(handle, rocsparse_operation_none, nrow, ncol, nnz, alpha, descr, csrValA, csrRowPtrA, csrColIndA, x, beta, y); } @@ -73,7 +73,7 @@ void run_benchmark(benchmark::State &state, const hipStream_t stream, int batch, for (size_t i=0; i benchmarks = { - benchmark::RegisterBenchmark("rocsparseScsrmv", run_benchmark, + benchmark::RegisterBenchmark("rocsparse_scsrmv", run_benchmark, stream, batch_size, - handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, + handle, rocsparse_operation_none, nrow, nrow, nnz, descrA, &alphaf, dAvalf, dAptr, dAcol, dxf, &betaf, dyf), - benchmark::RegisterBenchmark("rocsparseDcsrmv", run_benchmark, + benchmark::RegisterBenchmark("rocsparse_dcsrmv", run_benchmark, stream, batch_size, - handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, + handle, rocsparse_operation_none, nrow, nrow, nnz, descrA, &alphad, dAvald, dAptr, dAcol, dxd, &betad, dyd) }; @@ -249,8 +249,8 @@ int main(int argc, char *argv[]) HIP_CHECK(hipFree(dyf)); HIP_CHECK(hipFree(dyd)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); return 0; } diff --git a/benchmark/benchmark_spmv.cpp b/benchmark/benchmark_spmv.cpp index 2f4a6ae1..9c7d0d59 100644 --- a/benchmark/benchmark_spmv.cpp +++ b/benchmark/benchmark_spmv.cpp @@ -20,40 +20,40 @@ #define ROCSPARSE_CHECK(stat) \ { \ - rocsparseStatus_t err = stat; \ - if (err != ROCSPARSE_STATUS_SUCCESS) \ + rocsparse_status err = stat; \ + if (err != rocsparse_status_success) \ { \ fprintf(stderr, "ROCSPARSE error: %d line: %d\n", err, __LINE__); \ exit(stat); \ } \ } -void csrmv(rocsparseHandle_t handle, rocsparseOperation_t trans, +void csrmv(rocsparse_handle handle, rocsparse_operation trans, int nrow, int ncol, int nnz, const float *alpha, - rocsparseMatDescr_t descrA, const float *csrValA, + rocsparse_mat_descr descrA, const 
float *csrValA, const int *csrRowPtrA, const int *csrColIndA, const float *x, const float *beta, float *y) { - ROCSPARSE_CHECK(rocsparseScsrmv(handle, trans, nrow, ncol, nnz, alpha, - descrA, csrValA, csrRowPtrA, csrColIndA, - x, beta, y)); + ROCSPARSE_CHECK(rocsparse_scsrmv(handle, trans, nrow, ncol, nnz, alpha, + descrA, csrValA, csrRowPtrA, csrColIndA, + x, beta, y)); } -void csrmv(rocsparseHandle_t handle, rocsparseOperation_t trans, +void csrmv(rocsparse_handle handle, rocsparse_operation trans, int nrow, int ncol, int nnz, const double *alpha, - rocsparseMatDescr_t descrA, const double *csrValA, + rocsparse_mat_descr descrA, const double *csrValA, const int *csrRowPtrA, const int *csrColIndA, const double *x, const double *beta, double *y) { - ROCSPARSE_CHECK(rocsparseDcsrmv(handle, trans, nrow, ncol, nnz, alpha, - descrA, csrValA, csrRowPtrA, csrColIndA, - x, beta, y)); + ROCSPARSE_CHECK(rocsparse_dcsrmv(handle, trans, nrow, ncol, nnz, alpha, + descrA, csrValA, csrRowPtrA, csrColIndA, + x, beta, y)); } template void run_benchmark(benchmark::State &state, const hipStream_t stream, int batch, - rocsparseHandle_t handle, rocsparseOperation_t trans, - int nrow, int ncol, int nnz, rocsparseMatDescr_t descr, + rocsparse_handle handle, rocsparse_operation trans, + int nrow, int ncol, int nnz, rocsparse_mat_descr descr, const ValueType *alpha, const ValueType *csrValA, const int *csrRowPtrA, const int *csrColIndA, const ValueType *x, const ValueType *beta, ValueType *y) @@ -61,7 +61,7 @@ void run_benchmark(benchmark::State &state, const hipStream_t stream, int batch, // Warm up for (int i=0; i<10; ++i) { - csrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, + csrmv(handle, rocsparse_operation_none, nrow, ncol, nnz, alpha, descr, csrValA, csrRowPtrA, csrColIndA, x, beta, y); } @@ -73,7 +73,7 @@ void run_benchmark(benchmark::State &state, const hipStream_t stream, int batch, for (size_t i=0; i benchmarks = { - benchmark::RegisterBenchmark("rocsparseScsrmv", 
run_benchmark, + benchmark::RegisterBenchmark("rocsparse_scsrmv", run_benchmark, stream, batch_size, - handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, + handle, rocsparse_operation_none, nrow, nrow, nnz, descrA, &alphaf, dAvalf, dAptr, dAcol, dxf, &betaf, dyf), - benchmark::RegisterBenchmark("rocsparseDcsrmv", run_benchmark, + benchmark::RegisterBenchmark("rocsparse_dcsrmv", run_benchmark, stream, batch_size, - handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, + handle, rocsparse_operation_none, nrow, nrow, nnz, descrA, &alphad, dAvald, dAptr, dAcol, dxd, &betad, dyd) }; @@ -225,8 +225,8 @@ int main(int argc, char *argv[]) HIP_CHECK(hipFree(dyf)); HIP_CHECK(hipFree(dyd)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); return 0; } diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake new file mode 100644 index 00000000..86c3e922 --- /dev/null +++ b/cmake/Dependencies.cmake @@ -0,0 +1,131 @@ +# ######################################################################## +# Copyright 2018 Advanced Micro Devices, Inc. +# ######################################################################## + +# Dependencies + +# Git +find_package(Git REQUIRED) + +# DownloadProject package +include(cmake/DownloadProject.cmake) + +# HIP configuration +if(HIP_PLATFORM STREQUAL "hcc") + # Workaround until hcc & hip cmake modules fixes symlink logic in their config files. + # (Thanks to rocBLAS devs for finding workaround for this problem!) 
+ list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hcc /opt/rocm/hip) + # Ignore hcc warning: argument unused during compilation: '-isystem /opt/rocm/hip/include' + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unused-command-line-argument") + find_package(hcc REQUIRED CONFIG PATHS /opt/rocm) + find_package(hip REQUIRED CONFIG PATHS /opt/rocm) +elseif(HIP_PLATFORM STREQUAL "nvcc") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xcompiler -Wall") + set(CMAKE_C_COMPILE_OPTIONS_PIC "-Xcompiler ${CMAKE_C_COMPILE_OPTIONS_PIC}" ) + set(CMAKE_CXX_COMPILE_OPTIONS_PIC "-Xcompiler ${CMAKE_CXX_COMPILE_OPTIONS_PIC}" ) + set(CMAKE_SHARED_LIBRARY_C_FLAGS "-Xlinker ${CMAKE_SHARED_LIBRARY_C_FLAGS}" ) + set(CMAKE_SHARED_LIBRARY_CXX_FLAGS "-Xlinker ${CMAKE_SHARED_LIBRARY_CXX_FLAGS}" ) + set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Xlinker -soname," ) + set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Xlinker -soname," ) + set(CMAKE_SHARED_LIBRARY_RUNTIME_C_FLAG "-Xlinker -rpath," ) + set(CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG "-Xlinker -rpath," ) + set(CMAKE_EXECUTABLE_RUNTIME_C_FLAG "-Xlinker -rpath," ) + set(CMAKE_EXECUTABLE_RUNTIME_CXX_FLAG "-Xlinker -rpath," ) + set(CMAKE_C_COMPILE_OPTIONS_VISIBILITY "-Xcompiler ${CMAKE_C_COMPILE_OPTIONS_VISIBILITY}" ) + set(CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY "-Xcompiler ${CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY}" ) + set(CMAKE_C_COMPILE_OPTIONS_VISIBILITY_INLINES_HIDDEN "-Xcompiler ${CMAKE_C_COMPILE_OPTIONS_VISIBILITY_INLINES_HIDDEN}" ) + set(CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY_INLINES_HIDDEN "-Xcompiler ${CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY_INLINES_HIDDEN}" ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -arch sm_35") +endif() + +# Test dependencies +if(BUILD_TEST) + if(NOT DEPENDENCIES_FORCE_DOWNLOAD) + find_package(GTest QUIET) + endif() + if(NOT GTEST_FOUND) + message(STATUS "GTest not found. 
Downloading and building GTest.") + set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gtest CACHE PATH "") + download_project(PROJ googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG master + INSTALL_DIR ${GTEST_ROOT} + CMAKE_ARGS -DBUILD_GTEST=ON -DINSTALL_GTEST=ON -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=${GTEST_ROOT} + LOG_DOWNLOAD TRUE + LOG_CONFIGURE TRUE + LOG_BUILD TRUE + LOG_INSTALL TRUE + BUILD_PROJECT TRUE + UPDATE_DISCONNECTED TRUE + ) + endif() + find_package(GTest REQUIRED) + # Download some test matrices + set(TEST_MATRICES + nos1 + nos2 + nos3 + nos4 + nos5 + nos6 + nos7 + ) + foreach(m ${TEST_MATRICES}) + if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/matrices/${m}.mtx") + file(DOWNLOAD ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/lanpro/${m}.mtx.gz + ${CMAKE_CURRENT_BINARY_DIR}/matrices/${m}.mtx.gz) + execute_process(COMMAND gzip -d -f ${m}.mtx.gz + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/matrices) + endif() + endforeach() +endif() + +# Benchmark dependencies +if(BUILD_BENCHMARK) + if(NOT DEPENDENCIES_FORCE_DOWNLOAD) + find_package(benchmark QUIET) + endif() + if(NOT benchmark_FOUND) + message(STATUS "Google Benchmark not found. 
Downloading and building Google Benchmark.") + set(GOOGLEBENCHMARK_ROOT ${CMAKE_CURRENT_BINARY_DIR}/googlebenchmark CACHE PATH "") + download_project(PROJ googlebenchmark + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG master + INSTALL_DIR ${GOOGLEBENCHMARK_ROOT} + CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBENCHMARK_ENABLE_TESTING=OFF -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=${GOOGLEBENCHMARK_ROOT} + LOG_DOWNLOAD TRUE + LOG_CONFIGURE TRUE + LOG_BUILD TRUE + LOG_INSTALL TRUE + BUILD_PROJECT TRUE + UPDATE_DISCONNECTED TRUE + ) + endif() + find_package(benchmark REQUIRED CONFIG PATHS ${GOOGLEBENCHMARK_ROOT}) +endif() + +# rocPRIM package +message(STATUS "Downloading rocPRIM.") +download_project(PROJ rocPRIM + GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git + GIT_TAG master +) + +# ROCm package +find_package(ROCM QUIET CONFIG PATHS /opt/rocm) +if(NOT ROCM_FOUND) + set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download") + file(DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip + ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}.zip + ) + execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}.zip + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + find_package(ROCM REQUIRED CONFIG PATHS ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}) +endif() + +include(ROCMSetupVersion) +include(ROCMCreatePackage) +include(ROCMInstallTargets) +include(ROCMPackageConfigHelpers) +include(ROCMInstallSymlinks) diff --git a/cmake/SetToolchain.cmake b/cmake/SetToolchain.cmake new file mode 100644 index 00000000..467176da --- /dev/null +++ b/cmake/SetToolchain.cmake @@ -0,0 +1,47 @@ +# Find HIP package +find_package(HIP REQUIRED) + +# Select toolchain +if(HIP_PLATFORM STREQUAL "nvcc") + # Find HIPCC executable + find_program( + HIP_HIPCC_EXECUTABLE + NAMES hipcc + PATHS + "${HIP_ROOT_DIR}" + ENV ROCM_PATH + ENV HIP_PATH + /opt/rocm + /opt/rocm/hip + 
PATH_SUFFIXES bin + NO_DEFAULT_PATH + ) + if(NOT HIP_HIPCC_EXECUTABLE) + # Now search in default paths + find_program(HIP_HIPCC_EXECUTABLE hipcc) + endif() + mark_as_advanced(HIP_HIPCC_EXECUTABLE) + set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +elseif(HIP_PLATFORM STREQUAL "hcc") + # Find HCC executable + find_program( + HIP_HCC_EXECUTABLE + NAMES hcc + PATHS + "${HIP_ROOT_DIR}" + ENV ROCM_PATH + ENV HIP_PATH + /opt/rocm + /opt/rocm/hip + PATH_SUFFIXES bin + NO_DEFAULT_PATH + ) + if(NOT HIP_HCC_EXECUTABLE) + # Now search in default paths + find_program(HIP_HCC_EXECUTABLE hcc) + endif() + mark_as_advanced(HIP_HCC_EXECUTABLE) + set(CMAKE_CXX_COMPILER ${HIP_HCC_EXECUTABLE}) +else() + message(FATAL_ERROR "HIP_PLATFORM must be 'hcc/nvcc' (AMD ROCm platform).") +endif() diff --git a/cmake/Verbose.cmake b/cmake/Verbose.cmake new file mode 100644 index 00000000..87ef25fb --- /dev/null +++ b/cmake/Verbose.cmake @@ -0,0 +1,24 @@ +message(STATUS "rocsparse_VERSION : ${rocsparse_VERSION}") +message(STATUS "\t==>CMAKE_BUILD_TYPE : ${CMAKE_BUILD_TYPE}") +message(STATUS "\t==>BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}") +message(STATUS "\t==>CMAKE_INSTALL_PREFIX link : ${CMAKE_INSTALL_PREFIX}") +message(STATUS "\t==>CMAKE_MODULE_PATH link : ${CMAKE_MODULE_PATH}") +message(STATUS "\t==>CMAKE_PREFIX_PATH link : ${CMAKE_PREFIX_PATH}") +message(STATUS "==============") +message(STATUS "\t==>CMAKE_SYSTEM_NAME : ${CMAKE_SYSTEM_NAME}") +message(STATUS "\t>>=HIP_ROOT_DIR : ${HIP_ROOT_DIR}") +message(STATUS "\t==>CMAKE_CXX_COMPILER : ${CMAKE_CXX_FLAGS}") +message(STATUS "\t==>CMAKE_CXX_COMPILER_VERSION : ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "\t==>CMAKE_CXX_COMPILER debug : ${CMAKE_CXX_FLAGS_DEBUG}") +message(STATUS "\t==>CMAKE_CXX_COMPILER release : ${CMAKE_CXX_FLAGS_RELEASE}") +message(STATUS "\t==>CMAKE_CXX_COMPILER relwithdebinfo : ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") +message(STATUS "\t==>CMAKE_EXE_LINKER_FLAGS : ${CMAKE_EXE_LINKER_FLAGS}") +message(STATUS 
"\t==>CMAKE_EXE_LINKER_FLAGS_RELEASE : ${CMAKE_EXE_LINKER_FLAGS_RELEASE}") +message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS : ${CMAKE_SHARED_LINKER_FLAGS}") +message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_RELEASE : ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}") +message(STATUS "==============" ) +message(STATUS "\t==>CMAKE_SHARED_LIBRARY_C_FLAGS : ${CMAKE_SHARED_LIBRARY_C_FLAGS}") +message(STATUS "\t==>CMAKE_SHARED_LIBRARY_CXX_FLAGS : ${CMAKE_SHARED_LIBRARY_CXX_FLAGS}") +message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS : ${CMAKE_SHARED_LINKER_FLAGS}") +message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_DEBUG : ${CMAKE_SHARED_LINKER_FLAGS_DEBUG}") +message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_RELEASE : ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}") diff --git a/example/rocsparse_handle.cpp b/example/rocsparse_handle.cpp index c4e335ab..08c1160a 100644 --- a/example/rocsparse_handle.cpp +++ b/example/rocsparse_handle.cpp @@ -7,18 +7,18 @@ int main(int argc, char *argv[]) { - rocsparseHandle_t handle; - rocsparseCreate(&handle); + rocsparse_handle handle; + rocsparse_create_handle(&handle); int version; - rocsparseGetVersion(handle, &version); + rocsparse_get_version(handle, &version); printf("rocSPARSE version %d.%d.%d\n", version / 100000, version / 100 % 1000, version % 100); - rocsparseDestroy(handle); + rocsparse_destroy_handle(handle); return 0; } diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index 8ba24a3d..ea9e6847 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -1,95 +1,76 @@ # ######################################################################## -# Copyright 2016 Advanced Micro Devices, Inc. +# Copyright 2018 Advanced Micro Devices, Inc. 
# ######################################################################## -# Package that helps me set visibility for function names exported from shared library -include(GenerateExportHeader) - -# Verbose build info +# Print verbose compiler flags if(BUILD_VERBOSE) - message(STATUS "rocsparse_VERSION: ${rocsparse_VERSION}") - message(STATUS "\t==>CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") - message(STATUS "\t==>BUILD_SHARED_LIBS: ${BUILD_SHARED_LIBS}") - message(STATUS "\t==>CMAKE_INSTALL_PREFIX link: " ${CMAKE_INSTALL_PREFIX}) - message(STATUS "\t==>CMAKE_MODULE_PATH link: " ${CMAKE_MODULE_PATH}) - message(STATUS "\t==>CMAKE_PREFIX_PATH link: " ${CMAKE_PREFIX_PATH}) - message(STATUS "==============") - message(STATUS "\t==>CMAKE_CXX_COMPILER: " ${CMAKE_CXX_FLAGS}) - message(STATUS "\t==>CMAKE_CXX_COMPILER debug: " ${CMAKE_CXX_FLAGS_DEBUG}) - message(STATUS "\t==>CMAKE_CXX_COMPILER release: " ${CMAKE_CXX_FLAGS_RELEASE}) - message(STATUS "\t==>CMAKE_CXX_COMPILER relwithdebinfo: " ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) - message(STATUS "\t==>CMAKE_EXE_LINKER_FLAGS: " ${CMAKE_EXE_LINKER_FLAGS}) - message(STATUS "\t==>CMAKE_EXE_LINKER_FLAGS_RELEASE: " ${CMAKE_EXE_LINKER_FLAGS_RELEASE}) - message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS: " ${CMAKE_SHARED_LINKER_FLAGS}) - message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_RELEASE: " ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}) - message(STATUS "==============") - message(STATUS "\t==>CMAKE_SHARED_LIBRARY_C_FLAGS: ${CMAKE_SHARED_LIBRARY_C_FLAGS}") - message(STATUS "\t==>CMAKE_SHARED_LIBRARY_CXX_FLAGS: ${CMAKE_SHARED_LIBRARY_CXX_FLAGS}") - message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS: ${CMAKE_SHARED_LINKER_FLAGS}") - message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_DEBUG: ${CMAKE_SHARED_LINKER_FLAGS_DEBUG}") - message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_RELEASE: ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}") + include(../cmake/Verbose.cmake) endif() -# .so version -set(rocsparse_SOVERSION 0) - # Configure a header file to pass the rocSPARSE 
version -configure_file( - "${CMAKE_CURRENT_SOURCE_DIR}/include/rocsparse_version.h.in" - "${CMAKE_CURRENT_BINARY_DIR}/include/rocsparse_version.h" - @ONLY +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/include/rocsparse-version.h.in" + "${PROJECT_BINARY_DIR}/include/rocsparse-version.h" + @ONLY +) + +# Public rocSPARSE headers +set(rocsparse_headers_public + include/rocsparse-auxiliary.h + include/rocsparse-functions.h + include/rocsparse-types.h + include/rocsparse.h + ${PROJECT_BINARY_DIR}/include/rocsparse-version.h + ${PROJECT_BINARY_DIR}/include/rocsparse-export.h ) # Include sources include(src/CMakeLists.txt) -# Create library from sources -if(BUILD_SHARED_LIBS) - add_library(rocsparse SHARED ${rocsparse_source}) -else() - add_library(rocsparse STATIC ${rocsparse_source}) -endif() +# Create rocSPARSE library +add_library(rocsparse ${rocsparse_source} ${rocsparse_headers_public}) add_library(roc::rocsparse ALIAS rocsparse) +# Target link libraries if(HIP_PLATFORM STREQUAL "hcc") - # Linker targets target_link_libraries(rocsparse PRIVATE hip::hip_hcc hip::hip_device hcc::hccshared) - - # GPU targets foreach(target ${AMDGPU_TARGETS}) target_link_libraries(rocsparse PRIVATE --amdgpu-target=${target}) endforeach() endif() -# Include directories +# Target include directories target_include_directories(rocsparse - PRIVATE $ - PUBLIC $ - $ - $ + PRIVATE $ + PUBLIC $ + $ + $ ) # Target properties -set_target_properties(rocsparse PROPERTIES VERSION ${rocsparse_VERSION} SOVERSION ${rocsparse_SOVERSION} CXX_EXTENSIONS NO) +if(HIP_PLATFORM STREQUAL "hcc") + set_target_properties(rocsparse PROPERTIES VERSION ${rocsparse_VERSION} SOVERSION ${rocsparse_SOVERSION}) + set_target_properties(rocsparse PROPERTIES CXX_VISIBILITY_PRESET "hidden" VISIBILITY_INLINES_HIDDEN ON) +endif() set_target_properties(rocsparse PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging") set_target_properties(rocsparse PROPERTIES DEBUG_POSTFIX "-d") -set_target_properties(rocsparse 
PROPERTIES CXX_VISIBILITY_PRESET "hidden" VISIBILITY_INLINES_HIDDEN ON) -generate_export_header(rocsparse EXPORT_FILE_NAME ${PROJECT_BINARY_DIR}/include/rocsparse_export.h) - -# Installation -rocm_install_targets( - TARGETS rocsparse - INCLUDE - ${CMAKE_SOURCE_DIR}/library/include - ${CMAKE_BINARY_DIR}/include - PREFIX rocsparse + +# Generate export header +include(GenerateExportHeader) +generate_export_header(rocsparse EXPORT_FILE_NAME ${PROJECT_BINARY_DIR}/include/rocsparse-export.h) + +# Install targets +rocm_install_targets(TARGETS rocsparse + INCLUDE + ${CMAKE_SOURCE_DIR}/library/include + ${CMAKE_BINARY_DIR}/include + PREFIX rocsparse ) -rocm_export_targets( - TARGETS rocsparse-targets - PREFIX rocsparse - DEPENDS PACKAGE hip - NAMESPACE roc:: +# Export targets +rocm_export_targets(TARGETS rocsparse-targets + PREFIX rocsparse + DEPENDS PACKAGE hip + NAMESPACE roc:: ) # Package specific CPACK vars diff --git a/library/include/rocsparse-auxiliary.h b/library/include/rocsparse-auxiliary.h new file mode 100644 index 00000000..64aec34d --- /dev/null +++ b/library/include/rocsparse-auxiliary.h @@ -0,0 +1,120 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + +/*!\file + * \brief rocsparse-auxiliary.h provides auxiliary functions in rocsparse +*/ + +#pragma once +#ifndef _ROCSPARSE_AUXILIARY_H_ +#define _ROCSPARSE_AUXILIARY_H_ + +#include "rocsparse-types.h" +#include "rocsparse-export.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************** + * \brief rocsparse_handle is a structure holding the rocsparse library context. + * It must be initialized using rocsparse_create_handle() + * and the returned handle must be passed + * to all subsequent library function calls. 
+ * It should be destroyed at the end using rocsparse_destroy_handle(). + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_create_handle(rocsparse_handle *handle); + +/******************************************************************************** + * \brief destroy handle + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_destroy_handle(rocsparse_handle handle); + +/******************************************************************************** + * \brief remove any streams from handle, and add one + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_set_stream(rocsparse_handle handle, hipStream_t stream); + +/******************************************************************************** + * \brief get stream [0] from handle + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_get_stream(rocsparse_handle handle, hipStream_t *stream); + +/******************************************************************************** + * \brief set rocsparse_pointer_mode + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_set_pointer_mode(rocsparse_handle handle, + rocsparse_pointer_mode pointer_mode); + +/******************************************************************************** + * \brief get rocsparse_pointer_mode + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, + rocsparse_pointer_mode *pointer_mode); + +/******************************************************************************** + * \brief Get rocSPARSE version + * version % 100 = patch level + * version / 100 % 
1000 = minor version + * version / 100000 = major version + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_get_version(rocsparse_handle handle, int *version); + +/******************************************************************************** + * \brief rocsparse_mat_descr is a structure holding the rocsparse matrix + * descriptor. It must be initialized using rocsparse_create_mat_descr() + * and the returned descriptor must be passed to all subsequent library function + * calls that involve the matrix. + * It should be destroyed at the end using rocsparse_destroy_mat_descr(). + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descrA); + +/******************************************************************************** + * \brief destroy matrix descriptor + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descrA); + +/******************************************************************************** + * \brief Set the index base of the matrix descriptor. + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descrA, + rocsparse_index_base base); + +/******************************************************************************** + * \brief Returns the index base of the matrix descriptor. + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr descrA); + +/******************************************************************************** + * \brief Set the matrix type of the matrix descriptor. 
+ *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descrA, + rocsparse_matrix_type type); + +/******************************************************************************** + * \brief Returns the matrix type of the matrix descriptor. + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descrA); + +#ifdef __cplusplus +} +#endif + +#endif /* _ROCSPARSE_AUXILIARY_H_ */ diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h new file mode 100644 index 00000000..090b6120 --- /dev/null +++ b/library/include/rocsparse-functions.h @@ -0,0 +1,191 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + +/*!\file + * \brief rocsparse-functions.h provides Sparse Linear Algebra Subprograms + * of Level 1, 2 and 3, using HIP optimized for AMD HCC-based GPU hardware. + * This library can also run on CUDA-based NVIDIA GPUs. +*/ + +#pragma once +#ifndef _ROCSPARSE_FUNCTIONS_H_ +#define _ROCSPARSE_FUNCTIONS_H_ + +#include "rocsparse-types.h" +#include "rocsparse-export.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * =========================================================================== + * level 1 SPARSE + * =========================================================================== + */ + + /*! 
\brief SPARSE Level 1 API + + \details + + @param[in] + + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_saxpyi(rocsparse_handle handle, + rocsparse_int nnz, + const float *alpha, + const float *xVal, + const rocsparse_int *xInd, + float *y, + rocsparse_index_base idxBase); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_daxpyi(rocsparse_handle handle, + rocsparse_int nnz, + const double *alpha, + const double *xVal, + const rocsparse_int *xInd, + double *y, + rocsparse_index_base idxBase); +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_caxpyi(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_float_complex *alpha, + const rocsparse_float_complex *xVal, + const rocsparse_int *xInd, + rocsparse_float_complex *y, + rocsparse_index_base idxBase); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_zaxpyi(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_double_complex *alpha, + const rocsparse_double_complex *xVal, + const rocsparse_int *xInd, + rocsparse_double_complex *y, + rocsparse_index_base idxBase); +*/ + +/* + * =========================================================================== + * level 2 SPARSE + * =========================================================================== + */ + +/*! \brief SPARSE Level 2 API + + \details + csrmv multiplies the dense vector x[i] with scalar alpha and sparse m x n + matrix A that is defined in CSR storage format and add the result to y[i] + that is multiplied by beta, for i = 1 , … , n + + y := alpha * op(A) * x + beta * y, + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + transA operation type of A. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero entries of A. + @param[in] + alpha scalar alpha. + @param[in] + descrA descriptor of A. + @param[in] + csrValA array of nnz elements of A. 
+ @param[in] + csrRowPtrA array of m+1 elements that point to the start + of every row of A. + @param[in] + csrColIndA array of nnz elements containing the column indices of A. + @param[in] + x array of n elements (op(A) = A) or m elements (op(A) = A^T or + op(A) = A^H). + @param[in] + beta scalar beta. + @param[inout] + y array of m elements (op(A) = A) or n elements (op(A) = A^T or + op(A) = A^H). + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, + rocsparse_operation transA, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const float *alpha, + const rocsparse_mat_descr descrA, + const float *csrValA, + const rocsparse_int *csrRowPtrA, + const rocsparse_int *csrColIndA, + const float *x, + const float *beta, + float *y); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, + rocsparse_operation transA, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const double *alpha, + const rocsparse_mat_descr descrA, + const double *csrValA, + const rocsparse_int *csrRowPtrA, + const rocsparse_int *csrColIndA, + const double *x, + const double *beta, + double *y); +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_ccsrmv(rocsparse_handle handle, + rocsparse_operation transA, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_float_complex *alpha, + const rocsparse_mat_descr descrA, + const rocsparse_float_complex *csrValA, + const rocsparse_int *csrRowPtrA, + const rocsparse_int *csrColIndA, + const rocsparse_float_complex *x, + const rocsparse_float_complex *beta, + rocsparse_float_complex *y); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, + rocsparse_operation transA, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_double_complex *alpha, + const rocsparse_mat_descr descrA, + const rocsparse_double_complex *csrValA, + const 
rocsparse_int *csrRowPtrA, + const rocsparse_int *csrColIndA, + const rocsparse_double_complex *x, + const rocsparse_double_complex *beta, + rocsparse_double_complex *y); +*/ +/* + * =========================================================================== + * level 3 SPARSE + * =========================================================================== + */ + +#ifdef __cplusplus +} +#endif + +#endif // _ROCSPARSE_FUNCTIONS_H_ diff --git a/library/include/rocsparse-types.h b/library/include/rocsparse-types.h new file mode 100644 index 00000000..c21bea10 --- /dev/null +++ b/library/include/rocsparse-types.h @@ -0,0 +1,87 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +/*! \file + * \brief rocsparse-types.h defines data types used by rocsparse + */ + +#pragma once +#ifndef _ROCSPARSE_TYPES_H_ +#define _ROCSPARSE_TYPES_H_ + +#include + +/*! \brief To specify whether int32 or int64 is used + */ +#if defined(rocsparse_ILP64) +typedef int64_t rocsparse_int; +#else +typedef int32_t rocsparse_int; +#endif + +typedef struct _rocsparse_handle *rocsparse_handle; +typedef struct _rocsparse_mat_descr *rocsparse_mat_descr; + +#ifdef __cplusplus +extern "C" { +#endif + +/* ============================================================================================ */ + +/*! parameter constants. */ + +/*! \brief Used to specify whether the matrix is to be transposed or not. */ +typedef enum rocsparse_operation_ { + rocsparse_operation_none = 111, /**< Operate with the matrix. */ + rocsparse_operation_transpose = 112, /**< Operate with the transpose of the matrix. */ + rocsparse_operation_conjugate_transpose = 113 /**< Operate with the conjugate transpose of the matrix. */ +} rocsparse_operation; + +/*! \brief Used to specify the matrix index base. 
*/ +typedef enum rocsparse_index_base_ { + rocsparse_index_base_zero = 0, + rocsparse_index_base_one = 1 +} rocsparse_index_base; + +/*! \brief Used to specify the matrix type. */ +typedef enum rocsparse_matrix_type_ { + rocsparse_matrix_type_general = 0, + rocsparse_matrix_type_symmetric = 1, + rocsparse_matrix_type_hermitian = 2 +} rocsparse_matrix_type; + +/* ============================================================================================ */ +/** + * @brief rocsparse status codes definition + */ +typedef enum rocsparse_status_ { + rocsparse_status_success = 0, /**< success */ + rocsparse_status_invalid_handle = 1, /**< handle not initialized, invalid or null */ + rocsparse_status_not_implemented = 2, /**< function is not implemented */ + rocsparse_status_invalid_pointer = 3, /**< invalid pointer parameter */ + rocsparse_status_invalid_size = 4, /**< invalid size parameter */ + rocsparse_status_memory_error = 5, /**< failed internal memory allocation, copy or dealloc */ + rocsparse_status_internal_error = 6, /**< other internal library failure */ + rocsparse_status_invalid_value = 7, /**< invalid value parameter */ + rocsparse_status_arch_mismatch = 8 /**< device arch is not supported */ +} rocsparse_status; + +/*! \brief Indicates the pointer is device pointer or host pointer */ +typedef enum rocsparse_pointer_mode_ { + rocsparse_pointer_mode_host = 0, + rocsparse_pointer_mode_device = 1 +} rocsparse_pointer_mode; + +/*! 
\brief Indicates if layer is active with bitmask*/ +typedef enum rocsparse_layer_mode { + rocsparse_layer_mode_none = 0b0000000000, + rocsparse_layer_mode_log_trace = 0b0000000001, + rocsparse_layer_mode_log_bench = 0b0000000010, +} rocsparse_layer_mode; + +#ifdef __cplusplus +} +#endif + +#endif // _ROCSPARSE_TYPES_H_ diff --git a/library/include/rocsparse_version.h.in b/library/include/rocsparse-version.h.in similarity index 61% rename from library/include/rocsparse_version.h.in rename to library/include/rocsparse-version.h.in index 026b6d4c..ebfc8065 100644 --- a/library/include/rocsparse_version.h.in +++ b/library/include/rocsparse-version.h.in @@ -1,13 +1,18 @@ /* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. + * Copyright 2018 Advanced Micro Devices, Inc. * ************************************************************************ */ -#ifndef ROCSPARSE_VERSION_H_ -#define ROCSPARSE_VERSION_H_ +/*!\file + * \brief rocsparse-version.h provides the configured version and settings + */ + +#pragma once +#ifndef _ROCSPARSE_VERSION_H_ +#define _ROCSPARSE_VERSION_H_ #define ROCSPARSE_VERSION_MAJOR @rocsparse_VERSION_MAJOR@ #define ROCSPARSE_VERSION_MINOR @rocsparse_VERSION_MINOR@ #define ROCSPARSE_VERSION_PATCH @rocsparse_VERSION_PATCH@ #define ROCSPARSE_VERSION_TWEAK @rocsparse_VERSION_TWEAK@ -#endif // ROCSPARSE_VERSION_H_ +#endif // _ROCSPARSE_VERSION_H_ diff --git a/library/include/rocsparse.h b/library/include/rocsparse.h index c69bee31..ea703562 100644 --- a/library/include/rocsparse.h +++ b/library/include/rocsparse.h @@ -1,270 +1,19 @@ /* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. + * Copyright 2018 Advanced Micro Devices, Inc. 
* ************************************************************************ */ -#ifndef ROCSPARSE_H_ -#define ROCSPARSE_H_ - -/* !\file - * \brief rocsparse.h exposes a common interface that provides Basic Linear - * Algebra Subroutines for sparse computation using HIP optimized AMD HCC- - * based GPU hardware. This library can also run on CUDA-based NVIDIA GPUs. - */ - -#include "rocsparse_version.h" -#include "rocsparse_export.h" - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/*! \brief rocsparse status codes definition. */ -typedef enum { - ROCSPARSE_STATUS_SUCCESS = 0, - ROCSPARSE_STATUS_NOT_INITIALIZED = 1, - ROCSPARSE_STATUS_ALLOC_FAILED = 2, - ROCSPARSE_STATUS_INVALID_VALUE = 3, - ROCSPARSE_STATUS_ARCH_MISMATCH = 4, - ROCSPARSE_STATUS_MAPPING_ERROR = 5, - ROCSPARSE_STATUS_EXECUTION_FAILED = 6, - ROCSPARSE_STATUS_INTERNAL_ERROR = 7, - ROCSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8, - ROCSPARSE_STATUS_ZERO_PIVOT = 9, - ROCSPARSE_STATUS_INVALID_POINTER = 10, - ROCSPARSE_STATUS_INVALID_SIZE = 11, - ROCSPARSE_STATUS_MEMORY_ERROR = 12, - ROCSPARSE_STATUS_INVALID_HANDLE = 13 -} rocsparseStatus_t; - -struct rocsparseContext; -typedef struct rocsparseContext *rocsparseHandle_t; - -struct rocsparseMatDescr; -typedef struct rocsparseMatDescr *rocsparseMatDescr_t; - -/*! \brief Used to specify whether the matrix is to be transposed or not. */ -typedef enum { - ROCSPARSE_OPERATION_NON_TRANSPOSE = 0, - ROCSPARSE_OPERATION_TRANSPOSE = 1, - ROCSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 -} rocsparseOperation_t; - -/*! \brief Indicates wether the pointer is device or host pointer. */ -typedef enum { - ROCSPARSE_POINTER_MODE_HOST = 0, - ROCSPARSE_POINTER_MODE_DEVICE = 1 -} rocsparsePointerMode_t; - -/*! \brief Used to specify the matrix index base. */ -typedef enum { - ROCSPARSE_INDEX_BASE_ZERO = 0, - ROCSPARSE_INDEX_BASE_ONE = 1 -} rocsparseIndexBase_t; - -/*! \brief Used to specify the matrix type. 
*/ -typedef enum { - ROCSPARSE_MATRIX_TYPE_GENERAL = 0, - ROCSPARSE_MATRIX_TYPE_SYMMETRIC = 1, - ROCSPARSE_MATRIX_TYPE_HERMITIAN = 2 -} rocsparseMatrixType_t; - -/*! \brief Indicates if layer is active with bitmask. */ -typedef enum { - ROCSPARSE_LAYER_MODE_NONE = 0b0000000000, - ROCSPARSE_LAYER_MODE_LOG_TRACE = 0b0000000001, - ROCSPARSE_LAYER_MODE_LOG_BENCH = 0b0000000010 -} rocsparseLayerMode_t; - - - - -/******************************************************************************** - * \brief rocsparseHandle_t is a structure holding the rocsparse library context. - * It must be initialized using rocsparseCreate() - * and the returned handle must be passed - * to all subsequent library function calls. - * It should be destroyed at the end using rocsparseDestroy(). - *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle); - -/******************************************************************************** - * \brief destroy handle - *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseDestroy(rocsparseHandle_t handle); - -/******************************************************************************** - * \brief rocsparseCreateMatDescr_t is a structure holding the rocsparse matrix - * descriptor. It must be initialized using rocsparseCreateMatDescr() - * and the retured handle must be passed to all subsequent library function - * calls that involve the matrix. - * It should be destroyed at the end using rocsparseDestroyMatDescr(). 
- *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseCreateMatDescr(rocsparseMatDescr_t *descrA); - -/******************************************************************************** - * \brief destroy matrix descriptor - *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseDestroyMatDescr(rocsparseMatDescr_t descrA); - -/******************************************************************************** - * \brief Set the index base of the matrix descriptor. - *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseSetMatIndexBase(rocsparseMatDescr_t descrA, - rocsparseIndexBase_t base); - -/******************************************************************************** - * \brief Returns the index base of the matrix descriptor. - *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseIndexBase_t rocsparseGetMatIndexBase(const rocsparseMatDescr_t descrA); - -/******************************************************************************** - * \brief Set the matrix type of the matrix descriptor. - *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseSetMatType(rocsparseMatDescr_t descrA, - rocsparseMatrixType_t type); - -/******************************************************************************** - * \brief Returns the matrix type of the matrix descriptor. - *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseMatrixType_t rocsparseGetMatType(const rocsparseMatDescr_t descrA); - -/******************************************************************************** - * \brief Indicates whether the scalar value pointers are on the host or device. 
- * Set pointer mode, can be host or device - *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseSetPointerMode(rocsparseHandle_t handle, - rocsparsePointerMode_t mode); -/******************************************************************************** - * \brief Get pointer mode, can be host or device. - *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, - rocsparsePointerMode_t *mode); - -/******************************************************************************** - *! \brief Set rocsparse stream used for all subsequent library function calls. - * If not set, all hip kernels will take the default NULL stream. - *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseSetStream(rocsparseHandle_t handle, - hipStream_t streamId); - -/******************************************************************************** - *! \brief Get rocsparse stream used for all subsequent library function calls. 
- *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseGetStream(rocsparseHandle_t handle, - hipStream_t *streamId); - -/******************************************************************************** - * \brief Get rocSPARSE version - * version % 100 = patch level - * version / 100 % 1000 = minor version - * version / 100000 = major version - *******************************************************************************/ -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseGetVersion(rocsparseHandle_t handle, int *version); - - - -/* - * =========================================================================== - * level 1 SPARSE - * =========================================================================== - */ - - - -/* - * =========================================================================== - * level 2 SPARSE - * =========================================================================== +/*!\file + * \brief rocsparse.h includes other *.h and exposes a common interface */ -/*! \brief SPARSE Level 2 API - - \details - csrmv multiplies the dense vector x[i] with scalar alpha and sparse m x n - matrix A that is defined in CSR storage format and add the result to y[i] - that is multiplied by beta, for i = 1 , … , n - - y := alpha * op(A) * x + beta * y, - - @param[in] - handle rocsparse_handle. - handle to the rocsparse library context queue. - @param[in] - transA operation type of A. - @param[in] - m number of rows of A. - @param[in] - n number of columns of A. - @param[in] - nnz number of non-zero entries of A. - @param[in] - alpha scalar alpha. - @param[in] - descrA descriptor of A. - @param[in] - csrValA array of nnz elements of A. - @param[in] - csrRowPtrA array of m+1 elements that point to the start - of every row of A. - @param[in] - csrColIndA array of nnz elements containing the column indices of A. 
- @param[in] - x array of n elements (op(A) = A) or m elements (op(A) = A^T or - op(A) = A^H). - @param[in] - beta scalar beta. - @param[inout] - y array of m elements (op(A) = A) or n elements (op(A) = A^T or - op(A) = A^H). - - ********************************************************************/ -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseScsrmv(rocsparseHandle_t handle, - rocsparseOperation_t transA, - int m, - int n, - int nnz, - const float *alpha, - const rocsparseMatDescr_t descrA, - const float *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const float *x, - const float *beta, - float *y); - -ROCSPARSE_EXPORT -rocsparseStatus_t rocsparseDcsrmv(rocsparseHandle_t handle, - rocsparseOperation_t transA, - int m, - int n, - int nnz, - const double *alpha, - const rocsparseMatDescr_t descrA, - const double *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const double *x, - const double *beta, - double *y); +#pragma once +#ifndef _ROCSPARSE_H_ +#define _ROCSPARSE_H_ -#ifdef __cplusplus -} -#endif +#include "rocsparse-export.h" +#include "rocsparse-version.h" +#include "rocsparse-types.h" +#include "rocsparse-auxiliary.h" +#include "rocsparse-functions.h" -#endif // ROCSPARSE_H_ +#endif // _ROCSPARSE_H_ diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 3cf082a1..88655ac9 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -4,11 +4,9 @@ # rocSPARSE source set(rocsparse_source - src/context.cpp - src/matrix.cpp + src/handle.cpp src/status.cpp - - src/rocsparse_auxiliary.cpp + src/level1/rocsparse_axpyi.cpp src/level2/rocsparse_csrmv.cpp ) diff --git a/library/src/context.cpp b/library/src/context.cpp deleted file mode 100644 index e65be1e6..00000000 --- a/library/src/context.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* ************************************************************************ - * Copyright 2018 Advanced Micro Devices, Inc. 
- * ************************************************************************ */ - -#include "definitions.h" -#include "context.h" -#include "rocsparse.h" -#include "utility.h" - -rocsparseContext::rocsparseContext() -{ - // Default device is active device - THROW_IF_HIP_ERROR(hipGetDevice(&device)); - THROW_IF_HIP_ERROR(hipGetDeviceProperties(&properties, device)); - - // Device warp size - warp_size = properties.warpSize; - - // Layer mode - char *str_layer_mode; - if ((str_layer_mode = getenv("ROCSPARSE_LAYER")) == NULL) - { - layer_mode = ROCSPARSE_LAYER_MODE_NONE; - } - else - { - layer_mode = (rocsparseLayerMode_t) (atoi(str_layer_mode)); - } - - // Open log file - if (layer_mode & ROCSPARSE_LAYER_MODE_LOG_TRACE) - { - open_log_stream(&log_trace_os, &log_trace_ofs, "ROCSPARSE_LOG_TRACE_PATH"); - } - - // Open log_bench file - if (layer_mode & ROCSPARSE_LAYER_MODE_LOG_BENCH) - { - open_log_stream(&log_bench_os, &log_bench_ofs, "ROCSPARSE_LOG_BENCH_PATH"); - } -} - -rocsparseContext::~rocsparseContext() -{ - if (log_trace_ofs.is_open()) - { - log_trace_ofs.close(); - } - if (log_bench_ofs.is_open()) - { - log_bench_ofs.close(); - } -} - -rocsparseStatus_t rocsparseContext::setStream(hipStream_t streamId) -{ - // TODO check if stream is valid - stream = streamId; - return ROCSPARSE_STATUS_SUCCESS; -} - -rocsparseStatus_t rocsparseContext::getStream(hipStream_t *streamId) const -{ - *streamId = stream; - return ROCSPARSE_STATUS_SUCCESS; -} diff --git a/library/src/handle.cpp b/library/src/handle.cpp new file mode 100644 index 00000000..70b4cc7e --- /dev/null +++ b/library/src/handle.cpp @@ -0,0 +1,86 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "definitions.h" +#include "handle.h" +#include "logging.h" + +#include + +/******************************************************************************* + * constructor + ******************************************************************************/ +_rocsparse_handle::_rocsparse_handle() +{ + // Default device is active device + THROW_IF_HIP_ERROR(hipGetDevice(&device)); + THROW_IF_HIP_ERROR(hipGetDeviceProperties(&properties, device)); + + // Device warp size + warp_size = properties.warpSize; + + // Layer mode + char *str_layer_mode; + if ((str_layer_mode = getenv("ROCSPARSE_LAYER")) == NULL) + { + layer_mode = rocsparse_layer_mode_none; + } + else + { + layer_mode = (rocsparse_layer_mode) (atoi(str_layer_mode)); + } + + // Open log file + if (layer_mode & rocsparse_layer_mode_log_trace) + { + open_log_stream(&log_trace_os, &log_trace_ofs, "ROCSPARSE_LOG_TRACE_PATH"); + } + + // Open log_bench file + if (layer_mode & rocsparse_layer_mode_log_bench) + { + open_log_stream(&log_bench_os, &log_bench_ofs, "ROCSPARSE_LOG_BENCH_PATH"); + } +} + +/******************************************************************************* + * destructor + ******************************************************************************/ +_rocsparse_handle::~_rocsparse_handle() +{ + // Close log files + if (log_trace_ofs.is_open()) + { + log_trace_ofs.close(); + } + if (log_bench_ofs.is_open()) + { + log_bench_ofs.close(); + } +} + +/******************************************************************************* + * Exactly like cuSPARSE, rocSPARSE only uses one stream for one API routine + ******************************************************************************/ + +/******************************************************************************* + * set stream: + This API assumes user has already created a valid stream + Associate the following rocsparse API call with this user provided 
stream + ******************************************************************************/ +rocsparse_status _rocsparse_handle::set_stream(hipStream_t user_stream) +{ + // TODO check if stream is valid + stream = user_stream; + return rocsparse_status_success; +} + +/******************************************************************************* + * get stream + ******************************************************************************/ +rocsparse_status _rocsparse_handle::get_stream(hipStream_t* user_stream) const +{ + *user_stream = stream; + return rocsparse_status_success; +} diff --git a/library/src/include/context.h b/library/src/include/context.h deleted file mode 100644 index 49c68e45..00000000 --- a/library/src/include/context.h +++ /dev/null @@ -1,49 +0,0 @@ -/* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. - * ************************************************************************ */ - -#ifndef ROCSPARSE_CONTEXT_H_ -#define ROCSPARSE_CONTEXT_H_ - -#include "rocsparse.h" - -#include -#include -#include - -/******************************************************************************* - * \brief rocsparseContext is a structure holding the rocsparse library context. 
-******************************************************************************/ -struct rocsparseContext -{ - // Constructor - rocsparseContext(); - // Destructor - ~rocsparseContext(); - - // Set stream - rocsparseStatus_t setStream(hipStream_t streamId); - // Get stream - rocsparseStatus_t getStream(hipStream_t *streamId) const; - - // device id - int device; - // device properties - hipDeviceProp_t properties; - // device warp size - int warp_size; - // stream ; default stream is system stream NULL - hipStream_t stream = 0; - // pointer mode ; default mode is host - rocsparsePointerMode_t pointer_mode = ROCSPARSE_POINTER_MODE_HOST; - // logging mode - rocsparseLayerMode_t layer_mode; - - // logging streams - std::ofstream log_trace_ofs; - std::ofstream log_bench_ofs; - std::ostream *log_trace_os; - std::ostream *log_bench_os; -}; - -#endif // ROCSPARSE_CONTEXT_H_ diff --git a/library/src/include/definitions.h b/library/src/include/definitions.h index 1f8cd5e6..6077d917 100644 --- a/library/src/include/definitions.h +++ b/library/src/include/definitions.h @@ -1,10 +1,10 @@ /* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. + * Copyright 2018 Advanced Micro Devices, Inc. * * ************************************************************************ */ -#ifndef ROCSPARSE_DEFINITIONS_H_ -#define ROCSPARSE_DEFINITIONS_H_ +#ifndef DEFINITIONS_H +#define DEFINITIONS_H #include "status.h" @@ -44,4 +44,4 @@ } \ } -#endif // ROCSPARSE_DEFINITIONS_H_ +#endif // DEFINITIONS_H diff --git a/library/src/include/handle.h b/library/src/include/handle.h new file mode 100644 index 00000000..45532b82 --- /dev/null +++ b/library/src/include/handle.h @@ -0,0 +1,73 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef HANDLE_H +#define HANDLE_H + +#include "rocsparse.h" + +#include +#include +#include + +/******************************************************************************** + * \brief rocsparse_handle is a structure holding the rocsparse library context. + * It must be initialized using rocsparse_create_handle() + * and the returned handle must be passed + * to all subsequent library function calls. + * It should be destroyed at the end using rocsparse_destroy_handle(). + *******************************************************************************/ +struct _rocsparse_handle +{ + // Constructor + _rocsparse_handle(); + // Destructor + ~_rocsparse_handle(); + + // Set stream + rocsparse_status set_stream(hipStream_t user_stream); + // Get stream + rocsparse_status get_stream(hipStream_t *user_stream) const; + + // device id + rocsparse_int device; + // device properties + hipDeviceProp_t properties; + // device warp size + rocsparse_int warp_size; + // stream ; default stream is system stream NULL + hipStream_t stream = 0; + // pointer mode ; default mode is host + rocsparse_pointer_mode pointer_mode = rocsparse_pointer_mode_host; + // logging mode + rocsparse_layer_mode layer_mode; + + // logging streams + std::ofstream log_trace_ofs; + std::ofstream log_bench_ofs; + std::ostream *log_trace_os; + std::ostream *log_bench_os; +}; + +/******************************************************************************** + * \brief rocsparse_mat_descr is a structure holding the rocsparse matrix + * descriptor. It must be initialized using rocsparse_create_mat_descr() + * and the returned handle must be passed to all subsequent library function + * calls that involve the matrix. + * It should be destroyed at the end using rocsparse_destroy_mat_descr(). 
+ *******************************************************************************/ +struct _rocsparse_mat_descr +{ + // Matrix type + rocsparse_matrix_type type = rocsparse_matrix_type_general; + // Fill mode TODO +// rocsparse_fill_mode fill; + // Diagonal type +// rocsparse_diag_type diag; + // Index base + rocsparse_index_base base = rocsparse_index_base_zero; +}; + +#endif // HANDLE_H diff --git a/library/src/include/logging.h b/library/src/include/logging.h index 48afaacd..d6c3a917 100644 --- a/library/src/include/logging.h +++ b/library/src/include/logging.h @@ -1,9 +1,10 @@ /* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. + * Copyright 2018 Advanced Micro Devices, Inc. * ************************************************************************ */ -#ifndef ROCSPARSE_LOGGING_H_ -#define ROCSPARSE_LOGGING_H_ +#pragma once +#ifndef LOGGING_H +#define LOGGING_H #include #include @@ -207,4 +208,4 @@ void log_argument(std::ostream& os, H head) os << "\n" << head; } -#endif // ROCSPARSE_LOGGING_H_ +#endif // LOGGING_H diff --git a/library/src/include/matrix.h b/library/src/include/matrix.h deleted file mode 100644 index 7b75abd1..00000000 --- a/library/src/include/matrix.h +++ /dev/null @@ -1,22 +0,0 @@ -/* ************************************************************************ - * Copyright 2018 Advanced Micro Devices, Inc. 
- * ************************************************************************ */ - -#ifndef ROCSPARSE_MATRIX_H_ -#define ROCSPARSE_MATRIX_H_ - -#include "rocsparse.h" - -struct rocsparseMatDescr -{ - // Matrix type - rocsparseMatrixType_t type = ROCSPARSE_MATRIX_TYPE_GENERAL; - // Fill mode TODO -// rocsparseFillMode_t fill; - // Diagonal type -// rocsparseDiagType_t diag; - // Index base - rocsparseIndexBase_t base = ROCSPARSE_INDEX_BASE_ZERO; -}; - -#endif // ROCSPARSE_MATRIX_H_ diff --git a/library/src/include/status.h b/library/src/include/status.h index c10fdbdd..3dabc451 100644 --- a/library/src/include/status.h +++ b/library/src/include/status.h @@ -1,18 +1,18 @@ /* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. + * Copyright 2018 Advanced Micro Devices, Inc. * * ************************************************************************ */ -#ifndef ROCSPARSE_STATUS_H_ -#define ROCSPARSE_STATUS_H_ +#ifndef STATUS_H +#define STATUS_H #include "rocsparse.h" #include /******************************************************************************* - * \brief convert hipError_t to rocblas_status + * \brief convert hipError_t to rocsparse_status ******************************************************************************/ -rocsparseStatus_t get_rocsparse_status_for_hip_status(hipError_t status); +rocsparse_status get_rocsparse_status_for_hip_status(hipError_t status); -#endif // ROCSPARSE_STATUS_H_ +#endif // STATUS_H diff --git a/library/src/include/utility.h b/library/src/include/utility.h index a4796009..91e98c32 100644 --- a/library/src/include/utility.h +++ b/library/src/include/utility.h @@ -1,12 +1,13 @@ /* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. + * Copyright 2018 Advanced Micro Devices, Inc. 
* ************************************************************************ */ -#ifndef ROCSPARSE_UTILITY_H_ -#define ROCSPARSE_UTILITY_H_ +#pragma once +#ifndef UTILITY_H +#define UTILITY_H #include "rocsparse.h" -#include "context.h" +#include "handle.h" #include "logging.h" #include @@ -14,16 +15,16 @@ #include // if trace logging is turned on with -// (handle->layer_mode & rocblas_layer_mode_log_trace) == true +// (handle->layer_mode & rocsparse_layer_mode_log_trace) == true // then // log_function will call log_arguments to log function // arguments with a comma separator template -void log_trace(rocsparseHandle_t handle, H head, Ts&... xs) +void log_trace(rocsparse_handle handle, H head, Ts&... xs) { if(nullptr != handle) { - if(handle->layer_mode & ROCSPARSE_LAYER_MODE_LOG_TRACE) + if(handle->layer_mode & rocsparse_layer_mode_log_trace) { std::string comma_separator = ","; @@ -34,16 +35,16 @@ void log_trace(rocsparseHandle_t handle, H head, Ts&... xs) } // if bench logging is turned on with -// (handle->layer_mode & rocblas_layer_mode_log_bench) == true +// (handle->layer_mode & rocsparse_layer_mode_log_bench) == true // then // log_bench will call log_arguments to log a string that -// can be input to the executable rocblas-bench. +// can be input to the executable rocsparse-bench. template -void log_bench(rocsparseHandle_t handle, H head, std::string precision, Ts&... xs) +void log_bench(rocsparse_handle handle, H head, std::string precision, Ts&... 
xs) { if(nullptr != handle) { - if(handle->layer_mode & ROCSPARSE_LAYER_MODE_LOG_BENCH) + if(handle->layer_mode & rocsparse_layer_mode_log_bench) { std::string space_separator = " "; @@ -66,15 +67,15 @@ std::string replaceX(std::string input_string) std::replace(input_string.begin(), input_string.end(), 'X', 'd'); } /* - else if(std::is_same::value) + else if(std::is_same::value) { std::replace(input_string.begin(), input_string.end(), 'X', 'c'); } - else if(std::is_same::value) + else if(std::is_same::value) { std::replace(input_string.begin(), input_string.end(), 'X', 'z'); } - else if(std::is_same::value) + else if(std::is_same::value) { std::replace(input_string.begin(), input_string.end(), 'X', 'h'); } @@ -82,4 +83,4 @@ std::string replaceX(std::string input_string) return input_string; } -#endif // ROCSPARSE_UTILITY_H_ +#endif // UTILITY_H diff --git a/library/src/level1/rocsparse_axpyi.cpp b/library/src/level1/rocsparse_axpyi.cpp new file mode 100644 index 00000000..a4b84677 --- /dev/null +++ b/library/src/level1/rocsparse_axpyi.cpp @@ -0,0 +1,207 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" + +#include + +template +__device__ +void axpyi_device(rocsparse_int nnz, + T alpha, + const T *xVal, + const rocsparse_int *xInd, + T *y, + rocsparse_index_base idxBase) +{ + int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if (tid >= nnz) + { + return; + } + + y[xInd[tid]-idxBase] += alpha * xVal[tid]; +} + +template +__global__ +void axpyi_kernel_host_scalar(rocsparse_int nnz, + T alpha, + const T *xVal, + const rocsparse_int *xInd, + T *y, + rocsparse_index_base idxBase) +{ + axpyi_device(nnz, alpha, xVal, xInd, y, idxBase); +} + +template +__global__ +void axpyi_kernel_device_scalar(rocsparse_int nnz, + const T *alpha, + const T *xVal, + const rocsparse_int *xInd, + T *y, + rocsparse_index_base idxBase) +{ + axpyi_device(nnz, *alpha, xVal, xInd, y, idxBase); +} + +/*! \brief SPARSE Level 1 API + + \details + axpyi computes y := alpha * x + y + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + nnz number of non-zero entries in x (nnz < 0 is rejected with rocsparse_status_invalid_size) + if nnz == 0 quick return with rocsparse_status_success + @param[in] + alpha scalar alpha. + @param[in] + xVal pointer storing vector x non-zero values on the GPU. + @param[in] + xInd pointer storing vector x non-zero value indices on the GPU. + @param[inout] + y pointer storing y on the GPU. + @param[in] + idxBase specifies the index base. 
+ + ********************************************************************/ +template +rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, + rocsparse_int nnz, + const T *alpha, + const T *xVal, + const rocsparse_int *xInd, + T *y, + rocsparse_index_base idxBase) +{ + // Check for valid handle + if (handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging // TODO bench logging + if (handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_axpyi"), + nnz, + *alpha, + (const void*&) xVal, + (const void*&) xInd, + (const void*&) y); + } + else + { + log_trace(handle, + replaceX("rocsparse_axpyi"), + nnz, + (const void*&) alpha, + (const void*&) xVal, + (const void*&) xInd, + (const void*&) y); + } + + // Check index base + if (idxBase != rocsparse_index_base_zero && + idxBase != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + + // Check size + if (nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if (alpha == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (xVal == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (xInd == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if (nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + +#define AXPYI_DIM 256 + dim3 axpyi_blocks((nnz-1)/AXPYI_DIM+1); + dim3 axpyi_threads(AXPYI_DIM); + + if (handle->pointer_mode == rocsparse_pointer_mode_device) + { + hipLaunchKernelGGL((axpyi_kernel_device_scalar), + axpyi_blocks, axpyi_threads, 0, stream, + nnz, alpha, xVal, xInd, y, idxBase); + } + else + { + if (*alpha == 0.0) + { + return rocsparse_status_success; + } + + hipLaunchKernelGGL((axpyi_kernel_host_scalar), + axpyi_blocks, axpyi_threads, 0, stream, + nnz, *alpha, 
xVal, xInd, y, idxBase); + } +#undef AXPYI_DIM + return rocsparse_status_success; +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" +rocsparse_status rocsparse_saxpyi(rocsparse_handle handle, + rocsparse_int nnz, + const float *alpha, + const float *xVal, + const rocsparse_int *xInd, + float *y, + rocsparse_index_base idxBase) +{ + return rocsparse_axpyi_template(handle, nnz, alpha, xVal, xInd, y, idxBase); +} + +extern "C" +rocsparse_status rocsparse_daxpyi(rocsparse_handle handle, + rocsparse_int nnz, + const double *alpha, + const double *xVal, + const rocsparse_int *xInd, + double *y, + rocsparse_index_base idxBase) +{ + return rocsparse_axpyi_template(handle, nnz, alpha, xVal, xInd, y, idxBase); +} diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index 80ab89cf..90f58c52 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -3,58 +3,110 @@ * ************************************************************************ */ #include "rocsparse.h" -#include "context.h" +#include "handle.h" #include "utility.h" -#include "matrix.h" #include "csrmv_device.h" #include -template +template __global__ -void csrmvn_kernel_host_pointer(int m, T alpha, const int *ptr, const int *col, - const T *val, const T *x, T beta, T *y) +void csrmvn_kernel_host_pointer(rocsparse_int m, + T alpha, + const rocsparse_int *ptr, + const rocsparse_int *col, + const T *val, + const T *x, + T beta, + T *y) { csrmvn_general_device( m, alpha, ptr, col, val, x, beta, y); } -template +template __global__ -void csrmvn_kernel_device_pointer(int m, const T *alpha, const int *ptr, const int *col, - const T *val, const T *x, const T *beta, T *y) +void csrmvn_kernel_device_pointer(rocsparse_int m, + const T *alpha, + const rocsparse_int *ptr, + const rocsparse_int *col, 
+ const T *val, + const T *x, + const T *beta, + T *y) { csrmvn_general_device( m, *alpha, ptr, col, val, x, *beta, y); } +/*! \brief SPARSE Level 2 API + + \details + csrmv multiplies the dense vector x[i] with scalar alpha and sparse m x n + matrix A that is defined in CSR storage format and add the result to y[i] + that is multiplied by beta, for i = 1 , … , n + + y := alpha * op(A) * x + beta * y, + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + transA operation type of A. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero entries of A. + @param[in] + alpha scalar alpha. + @param[in] + descrA descriptor of A. + @param[in] + csrValA array of nnz elements of A. + @param[in] + csrRowPtrA array of m+1 elements that point to the start + of every row of A. + @param[in] + csrColIndA array of nnz elements containing the column indices of A. + @param[in] + x array of n elements (op(A) = A) or m elements (op(A) = A^T or + op(A) = A^H). + @param[in] + beta scalar beta. + @param[inout] + y array of m elements (op(A) = A) or n elements (op(A) = A^T or + op(A) = A^H). 
+ + ********************************************************************/ template -rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, - rocsparseOperation_t transA, - int m, - int n, - int nnz, - const T *alpha, - const rocsparseMatDescr_t descrA, - const T *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const T *x, - const T *beta, - T *y) +rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, + rocsparse_operation transA, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const T *alpha, + const rocsparse_mat_descr descrA, + const T *csrValA, + const rocsparse_int *csrRowPtrA, + const rocsparse_int *csrColIndA, + const T *x, + const T *beta, + T *y) { // Check for valid handle and matrix descriptor if (handle == nullptr) { - return ROCSPARSE_STATUS_NOT_INITIALIZED; + return rocsparse_status_invalid_handle; } else if (descrA == nullptr) { - return ROCSPARSE_STATUS_NOT_INITIALIZED; + return rocsparse_status_invalid_handle; } // Logging TODO bench logging - if (handle->pointer_mode == ROCSPARSE_POINTER_MODE_HOST) + if (handle->pointer_mode == rocsparse_pointer_mode_host) { log_trace(handle, replaceX("rocsparse_Xcsrmv"), @@ -86,82 +138,81 @@ rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, } // Check matrix type - if (descrA->base != ROCSPARSE_INDEX_BASE_ZERO) + if (descrA->base != rocsparse_index_base_zero) { // TODO - return ROCSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED; + return rocsparse_status_not_implemented; } - if (descrA->type != ROCSPARSE_MATRIX_TYPE_GENERAL) + if (descrA->type != rocsparse_matrix_type_general) { // TODO - return ROCSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED; + return rocsparse_status_not_implemented; } // Check sizes if (m < 0) { - return ROCSPARSE_STATUS_INVALID_VALUE; + return rocsparse_status_invalid_size; } else if (n < 0) { - return ROCSPARSE_STATUS_INVALID_VALUE; + return rocsparse_status_invalid_size; } else if (nnz < 0) { - return ROCSPARSE_STATUS_INVALID_VALUE; + return 
rocsparse_status_invalid_size; } // Check pointer arguments if (csrValA == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } else if (csrRowPtrA == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } else if (csrColIndA == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } else if (x == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } else if (y == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } else if (alpha == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } else if (beta == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } // Quick return if possible if (m == 0 || n == 0 || nnz == 0) { - return ROCSPARSE_STATUS_SUCCESS; + return rocsparse_status_success; } // Stream hipStream_t stream = handle->stream; // Run different csrmv kernels - if (transA == ROCSPARSE_OPERATION_NON_TRANSPOSE) + if (transA == rocsparse_operation_none) { #define CSRMVN_DIM 512 - - int nnz_per_row = nnz / m; + rocsparse_int nnz_per_row = nnz / m; dim3 csrmvn_blocks((m-1)/CSRMVN_DIM+1); dim3 csrmvn_threads(CSRMVN_DIM); - if (handle->pointer_mode == ROCSPARSE_POINTER_MODE_DEVICE) + if (handle->pointer_mode == rocsparse_pointer_mode_device) { if (handle->warp_size == 32) { @@ -237,14 +288,14 @@ rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, } else { - return ROCSPARSE_STATUS_ARCH_MISMATCH; + return rocsparse_status_arch_mismatch; } } else { if (*alpha == 0.0 && *beta == 1.0) { - return ROCSPARSE_STATUS_SUCCESS; + return rocsparse_status_success; } if (handle->warp_size == 32) @@ -321,7 +372,7 @@ rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, } else { - return ROCSPARSE_STATUS_ARCH_MISMATCH; + return rocsparse_status_arch_mismatch; 
} } #undef CSRMVN_DIM @@ -329,9 +380,9 @@ rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, else { // TODO - return ROCSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED; + return rocsparse_status_not_implemented; } - return ROCSPARSE_STATUS_SUCCESS; + return rocsparse_status_success; } /* @@ -340,38 +391,42 @@ rocsparseStatus_t rocsparseTcsrmv(rocsparseHandle_t handle, * =========================================================================== */ -extern "C" rocsparseStatus_t rocsparseScsrmv(rocsparseHandle_t handle, - rocsparseOperation_t transA, - int m, - int n, - int nnz, - const float *alpha, - const rocsparseMatDescr_t descrA, - const float *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const float *x, - const float *beta, - float *y) +extern "C" +rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, + rocsparse_operation transA, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const float *alpha, + const rocsparse_mat_descr descrA, + const float *csrValA, + const rocsparse_int *csrRowPtrA, + const rocsparse_int *csrColIndA, + const float *x, + const float *beta, + float *y) { - return rocsparseTcsrmv(handle, transA, m, n, nnz, alpha, descrA, - csrValA, csrRowPtrA, csrColIndA, x, beta, y); + return rocsparse_csrmv_template( + handle, transA, m, n, nnz, alpha, descrA, + csrValA, csrRowPtrA, csrColIndA, x, beta, y); } -extern "C" rocsparseStatus_t rocsparseDcsrmv(rocsparseHandle_t handle, - rocsparseOperation_t transA, - int m, - int n, - int nnz, - const double *alpha, - const rocsparseMatDescr_t descrA, - const double *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const double *x, - const double *beta, - double *y) +extern "C" +rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, + rocsparse_operation transA, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const double *alpha, + const rocsparse_mat_descr descrA, + const double *csrValA, + const rocsparse_int *csrRowPtrA, + const rocsparse_int 
*csrColIndA, + const double *x, + const double *beta, + double *y) { - return rocsparseTcsrmv(handle, transA, m, n, nnz, alpha, descrA, - csrValA, csrRowPtrA, csrColIndA, x, beta, y); + return rocsparse_csrmv_template( + handle, transA, m, n, nnz, alpha, descrA, + csrValA, csrRowPtrA, csrColIndA, x, beta, y); } diff --git a/library/src/matrix.cpp b/library/src/matrix.cpp deleted file mode 100644 index a7e9cce2..00000000 --- a/library/src/matrix.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* ************************************************************************ - * Copyright 2018 Advanced Micro Devices, Inc. - * ************************************************************************ */ - -#include "matrix.h" -#include "rocsparse.h" - - - -/******************************************************************************** - * \brief rocsparseCreateMatDescr_t is a structure holding the rocsparse matrix - * descriptor. It must be initialized using rocsparseCreateMatDescr() - * and the retured handle must be passed to all subsequent library function - * calls that involve the matrix. - * It should be destroyed at the end using rocsparseDestroyMatDescr(). 
- *******************************************************************************/ -extern "C" -rocsparseStatus_t rocsparseCreateMatDescr(rocsparseMatDescr_t *descrA) -{ - if (descrA == nullptr) - { - return ROCSPARSE_STATUS_INVALID_POINTER; - } - else - { - // Allocate - try - { - *descrA = new rocsparseMatDescr; - } - catch(rocsparseStatus_t status) - { - return status; - } - return ROCSPARSE_STATUS_SUCCESS; - } -} - -/******************************************************************************** - * \brief destroy matrix descriptor - *******************************************************************************/ -extern "C" -rocsparseStatus_t rocsparseDestroyMatDescr(rocsparseMatDescr_t descrA) -{ - // Destruct - try - { - delete descrA; - } - catch(rocsparseStatus_t status) - { - return status; - } - return ROCSPARSE_STATUS_SUCCESS; -} - -/******************************************************************************** - * \brief Set the index base of the matrix descriptor. - *******************************************************************************/ -extern "C" -rocsparseStatus_t rocsparseSetMatIndexBase(rocsparseMatDescr_t descrA, - rocsparseIndexBase_t base) -{ - // Check if descriptor is valid - if (descrA == nullptr) - { - return ROCSPARSE_STATUS_INVALID_POINTER; - } - if (base != ROCSPARSE_INDEX_BASE_ZERO && - base != ROCSPARSE_INDEX_BASE_ONE) - { - return ROCSPARSE_STATUS_INVALID_VALUE; - } - descrA->base = base; - return ROCSPARSE_STATUS_SUCCESS; -} - -/******************************************************************************** - * \brief Returns the index base of the matrix descriptor. 
- *******************************************************************************/ -extern "C" -rocsparseIndexBase_t rocsparseGetMatIndexBase(const rocsparseMatDescr_t descrA) -{ - // If descriptor is invalid, default index base is returned - if (descrA == nullptr) - { - return ROCSPARSE_INDEX_BASE_ZERO; - } - return descrA->base; -} - -/******************************************************************************** - * \brief Set the matrix type of the matrix descriptor. - *******************************************************************************/ -extern "C" -rocsparseStatus_t rocsparseSetMatType(rocsparseMatDescr_t descrA, - rocsparseMatrixType_t type) -{ - // Check if descriptor is valid - if (descrA == nullptr) - { - return ROCSPARSE_STATUS_INVALID_POINTER; - } - if (type != ROCSPARSE_MATRIX_TYPE_GENERAL && - type != ROCSPARSE_MATRIX_TYPE_SYMMETRIC && - type != ROCSPARSE_MATRIX_TYPE_HERMITIAN) - { - return ROCSPARSE_STATUS_INVALID_VALUE; - } - descrA->type = type; - return ROCSPARSE_STATUS_SUCCESS; -} - -/******************************************************************************** - * \brief Returns the matrix type of the matrix descriptor. - *******************************************************************************/ -extern "C" -rocsparseMatrixType_t rocsparseGetMatType(const rocsparseMatDescr_t descrA) -{ - // If descriptor is invalid, default matrix type is returned - if (descrA == nullptr) - { - return ROCSPARSE_MATRIX_TYPE_GENERAL; - } - return descrA->type; -} diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index b317cbc7..f1f80592 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -2,41 +2,41 @@ * Copyright 2018 Advanced Micro Devices, Inc. 
* ************************************************************************ */ -#include "context.h" +#include "handle.h" #include "rocsparse.h" #include "utility.h" #include /******************************************************************************** - * \brief rocsparseHandle_t is a structure holding the rocsparse library context. - * It must be initialized using rocsparseCreate() + * \brief rocsparse_handle is a structure holding the rocsparse library context. + * It must be initialized using rocsparse_create_handle() * and the returned handle must be passed * to all subsequent library function calls. - * It should be destroyed at the end using rocsparseDestroy(). + * It should be destroyed at the end using rocsparse_destroy_handle(). *******************************************************************************/ extern "C" -rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle) +rocsparse_status rocsparse_create_handle(rocsparse_handle *handle) { // Check if handle is valid if (handle == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } else { // Allocate try { - *handle = new rocsparseContext; - log_trace(*handle, "rocsparseCreate"); + *handle = new _rocsparse_handle(); + log_trace(*handle, "rocsparse_create_handle"); } - catch(rocsparseStatus_t status) + catch(rocsparse_status status) { return status; } - return ROCSPARSE_STATUS_SUCCESS; + return rocsparse_status_success; } } @@ -44,19 +44,19 @@ rocsparseStatus_t rocsparseCreate(rocsparseHandle_t *handle) * \brief destroy handle *******************************************************************************/ extern "C" -rocsparseStatus_t rocsparseDestroy(rocsparseHandle_t handle) +rocsparse_status rocsparse_destroy_handle(rocsparse_handle handle) { - log_trace(handle, "rocsparseDestroy"); + log_trace(handle, "rocsparse_destroy_handle"); // Destruct try { delete handle; } - catch(rocsparseStatus_t status) + catch(rocsparse_status status) { return 
status; } - return ROCSPARSE_STATUS_SUCCESS; + return rocsparse_status_success; } /******************************************************************************** @@ -64,34 +64,34 @@ rocsparseStatus_t rocsparseDestroy(rocsparseHandle_t handle) * Set pointer mode, can be host or device *******************************************************************************/ extern "C" -rocsparseStatus_t rocsparseSetPointerMode(rocsparseHandle_t handle, - rocsparsePointerMode_t mode) +rocsparse_status rocsparse_set_pointer_mode(rocsparse_handle handle, + rocsparse_pointer_mode mode) { // Check if handle is valid if (handle == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } handle->pointer_mode = mode; - log_trace(handle, "rocsparseSetPointerMode", mode); - return ROCSPARSE_STATUS_SUCCESS; + log_trace(handle, "rocsparse_set_pointer_mode", mode); + return rocsparse_status_success; } /******************************************************************************** * \brief Get pointer mode, can be host or device. *******************************************************************************/ extern "C" -rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, - rocsparsePointerMode_t *mode) +rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, + rocsparse_pointer_mode *mode) { // Check if handle is valid if (handle == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } *mode = handle->pointer_mode; - log_trace(handle, "rocsparseGetPointerMode", *mode); - return ROCSPARSE_STATUS_SUCCESS; + log_trace(handle, "rocsparse_get_pointer_mode", *mode); + return rocsparse_status_success; } /******************************************************************************** @@ -99,32 +99,32 @@ rocsparseStatus_t rocsparseGetPointerMode(rocsparseHandle_t handle, * If not set, all hip kernels will take the default NULL stream. 
*******************************************************************************/ extern "C" -rocsparseStatus_t rocsparseSetStream(rocsparseHandle_t handle, - hipStream_t streamId) +rocsparse_status rocsparse_set_stream(rocsparse_handle handle, + hipStream_t stream_id) { // Check if handle is valid if (handle == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } - log_trace(handle, "rocsparseSetStream", streamId); - return handle->setStream(streamId); + log_trace(handle, "rocsparse_set_stream", stream_id); + return handle->set_stream(stream_id); } /******************************************************************************** *! \brief Get rocsparse stream used for all subsequent library function calls. *******************************************************************************/ extern "C" -rocsparseStatus_t rocsparseGetStream(rocsparseHandle_t handle, - hipStream_t *streamId) +rocsparse_status rocsparse_get_stream(rocsparse_handle handle, + hipStream_t *stream_id) { // Check if handle is valid if (handle == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } - log_trace(handle, "rocsparseGetStream", *streamId); - return handle->getStream(streamId); + log_trace(handle, "rocsparse_get_stream", *stream_id); + return handle->get_stream(stream_id); } /******************************************************************************** @@ -134,16 +134,134 @@ rocsparseStatus_t rocsparseGetStream(rocsparseHandle_t handle, * version / 100000 = major version *******************************************************************************/ extern "C" -rocsparseStatus_t rocsparseGetVersion(rocsparseHandle_t handle, int *version) +rocsparse_status rocsparse_get_version(rocsparse_handle handle, int *version) { // Check if handle is valid if (handle == nullptr) { - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; } *version = ROCSPARSE_VERSION_MAJOR * 
100000 + ROCSPARSE_VERSION_MINOR * 100 + ROCSPARSE_VERSION_PATCH; - log_trace(handle, "rocsparseGetVersion", *version); - return ROCSPARSE_STATUS_SUCCESS; + log_trace(handle, "rocsparse_get_version", *version); + return rocsparse_status_success; +} + +/******************************************************************************** + * \brief rocsparse_create_mat_descr_t is a structure holding the rocsparse matrix + * descriptor. It must be initialized using rocsparse_create_mat_descr() + * and the retured handle must be passed to all subsequent library function + * calls that involve the matrix. + * It should be destroyed at the end using rocsparse_destroy_mat_descr(). + *******************************************************************************/ +extern "C" +rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descrA) +{ + if (descrA == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else + { + // Allocate + try + { + *descrA = new _rocsparse_mat_descr; + } + catch(rocsparse_status status) + { + return status; + } + return rocsparse_status_success; + } +} + +/******************************************************************************** + * \brief destroy matrix descriptor + *******************************************************************************/ +extern "C" +rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descrA) +{ + // Destruct + try + { + delete descrA; + } + catch(rocsparse_status status) + { + return status; + } + return rocsparse_status_success; +} + +/******************************************************************************** + * \brief Set the index base of the matrix descriptor. 
+ *******************************************************************************/ +extern "C" +rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descrA, + rocsparse_index_base base) +{ + // Check if descriptor is valid + if (descrA == nullptr) + { + return rocsparse_status_invalid_pointer; + } + if (base != rocsparse_index_base_zero && + base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + descrA->base = base; + return rocsparse_status_success; +} + +/******************************************************************************** + * \brief Returns the index base of the matrix descriptor. + *******************************************************************************/ +extern "C" +rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr descrA) +{ + // If descriptor is invalid, default index base is returned + if (descrA == nullptr) + { + return rocsparse_index_base_zero; + } + return descrA->base; +} + +/******************************************************************************** + * \brief Set the matrix type of the matrix descriptor. + *******************************************************************************/ +extern "C" +rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descrA, + rocsparse_matrix_type type) +{ + // Check if descriptor is valid + if (descrA == nullptr) + { + return rocsparse_status_invalid_pointer; + } + if (type != rocsparse_matrix_type_general && + type != rocsparse_matrix_type_symmetric && + type != rocsparse_matrix_type_hermitian) + { + return rocsparse_status_invalid_value; + } + descrA->type = type; + return rocsparse_status_success; +} + +/******************************************************************************** + * \brief Returns the matrix type of the matrix descriptor. 
+ *******************************************************************************/ +extern "C" +rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descrA) +{ + // If descriptor is invalid, default matrix type is returned + if (descrA == nullptr) + { + return rocsparse_matrix_type_general; + } + return descrA->type; } diff --git a/library/src/status.cpp b/library/src/status.cpp index 187dbcf9..a4bf803f 100644 --- a/library/src/status.cpp +++ b/library/src/status.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. + * Copyright 2018 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -9,38 +9,38 @@ #include /******************************************************************************* - * \brief convert hipError_t to rocblas_status + * \brief convert hipError_t to rocsparse_status * TODO - enumerate library calls to hip runtime, enumerate possible errors from those calls ******************************************************************************/ -rocsparseStatus_t get_rocsparse_status_for_hip_status(hipError_t status) +rocsparse_status get_rocsparse_status_for_hip_status(hipError_t status) { switch(status) { // success case hipSuccess: - return ROCSPARSE_STATUS_SUCCESS; + return rocsparse_status_success; // internal hip memory allocation case hipErrorMemoryAllocation: case hipErrorLaunchOutOfResources: - return ROCSPARSE_STATUS_MEMORY_ERROR; + return rocsparse_status_memory_error; // user-allocated hip memory case hipErrorInvalidDevicePointer: // hip memory - return ROCSPARSE_STATUS_INVALID_POINTER; + return rocsparse_status_invalid_pointer; // user-allocated device, stream, event case hipErrorInvalidDevice: case hipErrorInvalidResourceHandle: - return ROCSPARSE_STATUS_INVALID_HANDLE; + return rocsparse_status_invalid_handle; // library using hip incorrectly case hipErrorInvalidValue: - 
return ROCSPARSE_STATUS_INTERNAL_ERROR; + return rocsparse_status_internal_error; // hip runtime failing case hipErrorNoDevice: // no hip devices case hipErrorUnknown: - default: return ROCSPARSE_STATUS_INTERNAL_ERROR; + default: return rocsparse_status_internal_error; } } diff --git a/test/test_rocsparse_csrmv.cpp b/test/test_rocsparse_csrmv.cpp index 92f1b6af..4eb6b600 100644 --- a/test/test_rocsparse_csrmv.cpp +++ b/test/test_rocsparse_csrmv.cpp @@ -10,12 +10,12 @@ #include #define HIP_CHECK(x) ASSERT_EQ(x, hipSuccess) -#define ROCSPARSE_CHECK(x) ASSERT_EQ(x, ROCSPARSE_STATUS_SUCCESS) +#define ROCSPARSE_CHECK(x) ASSERT_EQ(x, rocsparse_status_success) -TEST(Tests, rocsparseScsrmv) +TEST(Tests, rocsparse_scsrmv) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); // Generate problem int *Aptr = NULL; @@ -39,8 +39,8 @@ TEST(Tests, rocsparseScsrmv) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -62,9 +62,9 @@ TEST(Tests, rocsparseScsrmv) HIP_CHECK(hipMemcpy(dy, y, sizeof(float)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseScsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_scsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host float *result = (float*) malloc(sizeof(float)*nrow); @@ -95,14 +95,14 @@ TEST(Tests, rocsparseScsrmv) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + 
ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseDcsrmv) +TEST(Tests, rocsparse_dcsrmv) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); // Generate problem int *Aptr = NULL; @@ -126,8 +126,8 @@ TEST(Tests, rocsparseDcsrmv) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -151,9 +151,9 @@ TEST(Tests, rocsparseDcsrmv) //TODO analyse step - ROCSPARSE_CHECK(rocsparseDcsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_dcsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host double *result = (double*) malloc(sizeof(double)*nrow); @@ -184,14 +184,14 @@ TEST(Tests, rocsparseDcsrmv) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseScsrmv_nos1) +TEST(Tests, rocsparse_scsrmv_nos1) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -234,8 +234,8 @@ TEST(Tests, rocsparseScsrmv_nos1) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -257,9 +257,9 @@ TEST(Tests, rocsparseScsrmv_nos1) HIP_CHECK(hipMemcpy(dy, y, sizeof(float)*nrow, 
hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseScsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_scsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host float *result = (float*) malloc(sizeof(float)*nrow); @@ -290,14 +290,14 @@ TEST(Tests, rocsparseScsrmv_nos1) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseScsrmv_nos2) +TEST(Tests, rocsparse_scsrmv_nos2) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -340,8 +340,8 @@ TEST(Tests, rocsparseScsrmv_nos2) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -363,9 +363,9 @@ TEST(Tests, rocsparseScsrmv_nos2) HIP_CHECK(hipMemcpy(dy, y, sizeof(float)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseScsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_scsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host float *result = (float*) malloc(sizeof(float)*nrow); @@ -396,14 +396,14 @@ TEST(Tests, rocsparseScsrmv_nos2) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + 
ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseScsrmv_nos3) +TEST(Tests, rocsparse_scsrmv_nos3) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -446,8 +446,8 @@ TEST(Tests, rocsparseScsrmv_nos3) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -469,9 +469,9 @@ TEST(Tests, rocsparseScsrmv_nos3) HIP_CHECK(hipMemcpy(dy, y, sizeof(float)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseScsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_scsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host float *result = (float*) malloc(sizeof(float)*nrow); @@ -502,14 +502,14 @@ TEST(Tests, rocsparseScsrmv_nos3) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseScsrmv_nos4) +TEST(Tests, rocsparse_scsrmv_nos4) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -552,8 +552,8 @@ TEST(Tests, rocsparseScsrmv_nos4) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int 
*dAptr = NULL; @@ -575,9 +575,9 @@ TEST(Tests, rocsparseScsrmv_nos4) HIP_CHECK(hipMemcpy(dy, y, sizeof(float)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseScsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_scsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host float *result = (float*) malloc(sizeof(float)*nrow); @@ -608,14 +608,14 @@ TEST(Tests, rocsparseScsrmv_nos4) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseScsrmv_nos5) +TEST(Tests, rocsparse_scsrmv_nos5) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -658,8 +658,8 @@ TEST(Tests, rocsparseScsrmv_nos5) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -681,9 +681,9 @@ TEST(Tests, rocsparseScsrmv_nos5) HIP_CHECK(hipMemcpy(dy, y, sizeof(float)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseScsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_scsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host float *result = (float*) malloc(sizeof(float)*nrow); @@ -714,14 +714,14 @@ TEST(Tests, rocsparseScsrmv_nos5) HIP_CHECK(hipFree(dx)); 
HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseScsrmv_nos6) +TEST(Tests, rocsparse_scsrmv_nos6) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -764,8 +764,8 @@ TEST(Tests, rocsparseScsrmv_nos6) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -787,9 +787,9 @@ TEST(Tests, rocsparseScsrmv_nos6) HIP_CHECK(hipMemcpy(dy, y, sizeof(float)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseScsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_scsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host float *result = (float*) malloc(sizeof(float)*nrow); @@ -820,14 +820,14 @@ TEST(Tests, rocsparseScsrmv_nos6) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseScsrmv_nos7) +TEST(Tests, rocsparse_scsrmv_nos7) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -870,8 +870,8 @@ TEST(Tests, rocsparseScsrmv_nos7) } // Matrix descriptor - rocsparseMatDescr_t descrA; - 
ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -893,9 +893,9 @@ TEST(Tests, rocsparseScsrmv_nos7) HIP_CHECK(hipMemcpy(dy, y, sizeof(float)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseScsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_scsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host float *result = (float*) malloc(sizeof(float)*nrow); @@ -926,14 +926,14 @@ TEST(Tests, rocsparseScsrmv_nos7) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseDcsrmv_nos1) +TEST(Tests, rocsparse_dcsrmv_nos1) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -976,8 +976,8 @@ TEST(Tests, rocsparseDcsrmv_nos1) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -999,9 +999,9 @@ TEST(Tests, rocsparseDcsrmv_nos1) HIP_CHECK(hipMemcpy(dy, y, sizeof(double)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseDcsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_dcsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // 
Copy result to host double *result = (double*) malloc(sizeof(double)*nrow); @@ -1032,14 +1032,14 @@ TEST(Tests, rocsparseDcsrmv_nos1) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseDcsrmv_nos2) +TEST(Tests, rocsparse_dcsrmv_nos2) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -1082,8 +1082,8 @@ TEST(Tests, rocsparseDcsrmv_nos2) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -1105,9 +1105,9 @@ TEST(Tests, rocsparseDcsrmv_nos2) HIP_CHECK(hipMemcpy(dy, y, sizeof(double)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseDcsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_dcsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host double *result = (double*) malloc(sizeof(double)*nrow); @@ -1138,14 +1138,14 @@ TEST(Tests, rocsparseDcsrmv_nos2) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseDcsrmv_nos3) +TEST(Tests, rocsparse_dcsrmv_nos3) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int 
nrow; int ncol; @@ -1188,8 +1188,8 @@ TEST(Tests, rocsparseDcsrmv_nos3) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -1211,9 +1211,9 @@ TEST(Tests, rocsparseDcsrmv_nos3) HIP_CHECK(hipMemcpy(dy, y, sizeof(double)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseDcsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_dcsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host double *result = (double*) malloc(sizeof(double)*nrow); @@ -1244,14 +1244,14 @@ TEST(Tests, rocsparseDcsrmv_nos3) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseDcsrmv_nos4) +TEST(Tests, rocsparse_dcsrmv_nos4) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -1294,8 +1294,8 @@ TEST(Tests, rocsparseDcsrmv_nos4) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -1317,9 +1317,9 @@ TEST(Tests, rocsparseDcsrmv_nos4) HIP_CHECK(hipMemcpy(dy, y, sizeof(double)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseDcsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + 
ROCSPARSE_CHECK(rocsparse_dcsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host double *result = (double*) malloc(sizeof(double)*nrow); @@ -1350,14 +1350,14 @@ TEST(Tests, rocsparseDcsrmv_nos4) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseDcsrmv_nos5) +TEST(Tests, rocsparse_dcsrmv_nos5) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -1400,8 +1400,8 @@ TEST(Tests, rocsparseDcsrmv_nos5) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -1423,9 +1423,9 @@ TEST(Tests, rocsparseDcsrmv_nos5) HIP_CHECK(hipMemcpy(dy, y, sizeof(double)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseDcsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_dcsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host double *result = (double*) malloc(sizeof(double)*nrow); @@ -1456,14 +1456,14 @@ TEST(Tests, rocsparseDcsrmv_nos5) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseDcsrmv_nos6) +TEST(Tests, rocsparse_dcsrmv_nos6) { - 
rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -1506,8 +1506,8 @@ TEST(Tests, rocsparseDcsrmv_nos6) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -1529,9 +1529,9 @@ TEST(Tests, rocsparseDcsrmv_nos6) HIP_CHECK(hipMemcpy(dy, y, sizeof(double)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - ROCSPARSE_CHECK(rocsparseDcsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_dcsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host double *result = (double*) malloc(sizeof(double)*nrow); @@ -1562,14 +1562,14 @@ TEST(Tests, rocsparseDcsrmv_nos6) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } -TEST(Tests, rocsparseDcsrmv_nos7) +TEST(Tests, rocsparse_dcsrmv_nos7) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); int nrow; int ncol; @@ -1612,8 +1612,8 @@ TEST(Tests, rocsparseDcsrmv_nos7) } // Matrix descriptor - rocsparseMatDescr_t descrA; - ROCSPARSE_CHECK(rocsparseCreateMatDescr(&descrA)); + rocsparse_mat_descr descrA; + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descrA)); // Offload data to device int *dAptr = NULL; @@ -1635,9 +1635,9 @@ TEST(Tests, rocsparseDcsrmv_nos7) HIP_CHECK(hipMemcpy(dy, y, sizeof(double)*nrow, hipMemcpyHostToDevice)); //TODO analyse step - 
ROCSPARSE_CHECK(rocsparseDcsrmv(handle, ROCSPARSE_OPERATION_NON_TRANSPOSE, - nrow, nrow, nnz, &alpha, descrA, dAval, - dAptr, dAcol, dx, &beta, dy)); + ROCSPARSE_CHECK(rocsparse_dcsrmv(handle, rocsparse_operation_none, + nrow, nrow, nnz, &alpha, descrA, dAval, + dAptr, dAcol, dx, &beta, dy)); // Copy result to host double *result = (double*) malloc(sizeof(double)*nrow); @@ -1668,8 +1668,8 @@ TEST(Tests, rocsparseDcsrmv_nos7) HIP_CHECK(hipFree(dx)); HIP_CHECK(hipFree(dy)); - ROCSPARSE_CHECK(rocsparseDestroyMatDescr(descrA)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } diff --git a/test/test_rocsparse_handle.cpp b/test/test_rocsparse_handle.cpp index 395de9c8..18ac1dcb 100644 --- a/test/test_rocsparse_handle.cpp +++ b/test/test_rocsparse_handle.cpp @@ -5,11 +5,11 @@ #include #include -#define ROCSPARSE_CHECK(x) ASSERT_EQ(x, ROCSPARSE_STATUS_SUCCESS) +#define ROCSPARSE_CHECK(x) ASSERT_EQ(x, rocsparse_status_success) TEST(Tests, handle) { - rocsparseHandle_t handle; - ROCSPARSE_CHECK(rocsparseCreate(&handle)); - ROCSPARSE_CHECK(rocsparseDestroy(handle)); + rocsparse_handle handle; + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); } From 7867ab8d9d45e0486e1ed7eca3c573153e9fee85 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 4 May 2018 12:25:39 +0200 Subject: [PATCH 023/304] clients added --- CMakeLists.txt | 21 +- benchmark/CMakeLists.txt | 35 - benchmark/benchmark_csrmv_mtx.cpp | 256 --- benchmark/benchmark_spmv.cpp | 232 --- clients/CMakeLists.txt | 49 + clients/benchmarks/CMakeLists.txt | 56 + clients/benchmarks/client.cpp | 120 ++ clients/common/arg_check.cpp | 79 + .../rocsparse_template_specialization.cpp | 35 + clients/common/unit.cpp | 69 + clients/common/utility.cpp | 96 + clients/include/arg_check.hpp | 23 + clients/include/rocsparse.hpp | 24 + clients/include/rocsparse_test_unique_ptr.hpp 
| 64 + clients/include/testing_axpyi.hpp | 246 +++ clients/include/testing_csrmv.hpp | 248 +++ clients/include/unit.hpp | 32 + clients/include/utility.hpp | 182 ++ {example => clients/samples}/CMakeLists.txt | 22 +- .../samples/example_handle.cpp | 0 clients/tests/CMakeLists.txt | 51 + clients/tests/rocsparse_gtest_main.cpp | 33 + clients/tests/test_axpyi.cpp | 64 + clients/tests/test_csrmv.cpp | 63 + cmake/Dependencies.cmake | 10 +- test/CMakeLists.txt | 39 - test/test_device_apis.cpp | 97 - test/test_rocsparse_csrmv.cpp | 1675 ----------------- test/test_rocsparse_handle.cpp | 15 - utils/utils.h | 264 --- 30 files changed, 1554 insertions(+), 2646 deletions(-) delete mode 100644 benchmark/CMakeLists.txt delete mode 100644 benchmark/benchmark_csrmv_mtx.cpp delete mode 100644 benchmark/benchmark_spmv.cpp create mode 100644 clients/CMakeLists.txt create mode 100644 clients/benchmarks/CMakeLists.txt create mode 100644 clients/benchmarks/client.cpp create mode 100644 clients/common/arg_check.cpp create mode 100644 clients/common/rocsparse_template_specialization.cpp create mode 100644 clients/common/unit.cpp create mode 100644 clients/common/utility.cpp create mode 100644 clients/include/arg_check.hpp create mode 100644 clients/include/rocsparse.hpp create mode 100644 clients/include/rocsparse_test_unique_ptr.hpp create mode 100644 clients/include/testing_axpyi.hpp create mode 100644 clients/include/testing_csrmv.hpp create mode 100644 clients/include/unit.hpp create mode 100644 clients/include/utility.hpp rename {example => clients/samples}/CMakeLists.txt (62%) rename example/rocsparse_handle.cpp => clients/samples/example_handle.cpp (100%) create mode 100644 clients/tests/CMakeLists.txt create mode 100644 clients/tests/rocsparse_gtest_main.cpp create mode 100644 clients/tests/test_axpyi.cpp create mode 100644 clients/tests/test_csrmv.cpp delete mode 100644 test/CMakeLists.txt delete mode 100644 test/test_device_apis.cpp delete mode 100644 
test/test_rocsparse_csrmv.cpp delete mode 100644 test/test_rocsparse_handle.cpp delete mode 100644 utils/utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f34871e5..70573371 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,9 +41,9 @@ set(CMAKE_CXX_EXTENSIONS OFF) # Build options option(BUILD_SHARED_LIBS "Build rocSPARSE as a shared library" ON) -option(BUILD_TEST "Build tests (requires googletest)" OFF) -option(BUILD_BENCHMARK "Build benchmarks (requires googlebenchmark)" OFF) -option(BUILD_EXAMPLE "Build examples" ON) +option(BUILD_CLIENTS_TESTS "Build tests (requires googletest)" OFF) +option(BUILD_CLIENTS_BENCHMARKS "Build benchmarks (requires googlebenchmark)" OFF) +option(BUILD_CLIENTS_SAMPLES "Build examples" ON) option(BUILD_VERBOSE "Output additional build information" OFF) # Dependencies @@ -55,18 +55,7 @@ set(AMDGPU_TARGETS gfx803;gfx900 CACHE STRING "List of specific machine types fo # rocSPARSE library add_subdirectory(library) -# Tests -if(BUILD_TEST) +if(BUILD_CLIENTS_SAMPLES OR BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS) enable_testing() - add_subdirectory(test) -endif() - -# Benchmarks -if(BUILD_BENCHMARK) - add_subdirectory(benchmark) -endif() - -# Examples -if(BUILD_EXAMPLE) - add_subdirectory(example) + add_subdirectory(clients) endif() diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt deleted file mode 100644 index ad985896..00000000 --- a/benchmark/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -# ######################################################################## -# Copyright 2016 Advanced Micro Devices, Inc. 
-# ######################################################################## - -# Function to add rocsparse benchmarks -function(add_rocsparse_benchmark BENCHMARK_SOURCE) - get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE) - add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE}) - target_include_directories(${BENCHMARK_TARGET} SYSTEM - PUBLIC - ${CMAKE_SOURCE_DIR}/utils - ) - if(HIP_PLATFORM STREQUAL "hcc") - target_link_libraries(${BENCHMARK_TARGET} - PRIVATE - rocsparse - hip::hip_hcc - benchmark::benchmark - ) - foreach(amdgpu_target ${AMDGPU_TARGETS}) - target_link_libraries(${BENCHMARK_TARGET} - PRIVATE - --amdgpu-target=${amdgpu_target} - ) - endforeach() - endif() - set_target_properties(${BENCHMARK_TARGET} - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmark" - ) -endfunction() - -# Benchmarks -add_rocsparse_benchmark(benchmark_spmv.cpp) -add_rocsparse_benchmark(benchmark_csrmv_mtx.cpp) diff --git a/benchmark/benchmark_csrmv_mtx.cpp b/benchmark/benchmark_csrmv_mtx.cpp deleted file mode 100644 index 068f3446..00000000 --- a/benchmark/benchmark_csrmv_mtx.cpp +++ /dev/null @@ -1,256 +0,0 @@ -/* ************************************************************************ - * Copyright 2018 Advanced Micro Devices, Inc. 
- * ************************************************************************ */ - -#include "benchmark/benchmark.h" -#include "utils.h" - -#include -#include - -#define HIP_CHECK(stat) \ -{ \ - hipError_t err = stat; \ - if (err != hipSuccess) \ - { \ - fprintf(stderr, "HIP error: %d line: %d\n", err, __LINE__); \ - exit(stat); \ - } \ -} - -#define ROCSPARSE_CHECK(stat) \ -{ \ - rocsparse_status err = stat; \ - if (err != rocsparse_status_success) \ - { \ - fprintf(stderr, "ROCSPARSE error: %d line: %d\n", err, __LINE__); \ - exit(stat); \ - } \ -} - -void csrmv(rocsparse_handle handle, rocsparse_operation trans, - int nrow, int ncol, int nnz, const float *alpha, - rocsparse_mat_descr descrA, const float *csrValA, - const int *csrRowPtrA, const int *csrColIndA, - const float *x, const float *beta, float *y) -{ - ROCSPARSE_CHECK(rocsparse_scsrmv(handle, trans, nrow, ncol, nnz, alpha, - descrA, csrValA, csrRowPtrA, csrColIndA, - x, beta, y)); -} - -void csrmv(rocsparse_handle handle, rocsparse_operation trans, - int nrow, int ncol, int nnz, const double *alpha, - rocsparse_mat_descr descrA, const double *csrValA, - const int *csrRowPtrA, const int *csrColIndA, - const double *x, const double *beta, double *y) -{ - ROCSPARSE_CHECK(rocsparse_dcsrmv(handle, trans, nrow, ncol, nnz, alpha, - descrA, csrValA, csrRowPtrA, csrColIndA, - x, beta, y)); -} - -template -void run_benchmark(benchmark::State &state, const hipStream_t stream, int batch, - rocsparse_handle handle, rocsparse_operation trans, - int nrow, int ncol, int nnz, rocsparse_mat_descr descr, - const ValueType *alpha, const ValueType *csrValA, - const int *csrRowPtrA, const int *csrColIndA, - const ValueType *x, const ValueType *beta, ValueType *y) -{ - // Warm up - for (int i=0; i<10; ++i) - { - csrmv(handle, rocsparse_operation_none, - nrow, ncol, nnz, alpha, descr, csrValA, - csrRowPtrA, csrColIndA, x, beta, y); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _:state) - { - auto start = 
std::chrono::high_resolution_clock::now(); - - for (size_t i=0; i >(end-start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations()*batch* - (sizeof(ValueType)*(2*nrow+nnz)+sizeof(int)*(nrow+1+nnz))); - state.SetItemsProcessed(state.iterations()*batch*2*nnz); -} - -int main(int argc, char *argv[]) -{ - if (argc < 2) - { - fprintf(stderr, "%s [ ]\n", argv[0]); - return -1; - } - - int trials = 200; - int batch_size = 1; - - // Parse command line - if (argc > 2) - { - trials = atoi(argv[2]); - } - if (argc > 3) - { - batch_size = atoi(argv[3]); - } - - // rocSPARSE handle - rocsparse_handle handle; - ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); - - benchmark::Initialize(&argc, argv); - - hipStream_t stream = 0; - hipDeviceProp_t devProp; - int device_id = 0; - - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - printf("[HIP] Device name: %s\n", devProp.name); - - // Read matrix from file - int nrow; - int ncol; - int nnz; - - int *coo_row = NULL; - int *coo_col = NULL; - double *coo_val = NULL; - - if (readMatrixFromMTX(argv[1], nrow, ncol, nnz, - &coo_row, &coo_col, &coo_val) != 0) - { - fprintf(stderr, "Cannot read MTX file %s\n", argv[1]); - return -1; - } - printf("[MTX] %d x %d matrix with %d nnz\n", nrow, ncol, nnz); - - // Convert to CSR (host) TODO - int *Aptr = NULL; - int *Acol = NULL; - float *Avalf = NULL; - double *Avald = NULL; - - coo_to_csr(nrow, ncol, nnz, coo_row, coo_col, coo_val, - &Aptr, &Acol, &Avald); - - Avalf = (float*) malloc(sizeof(float)*nnz); - for (int i=0; i benchmarks = - { - benchmark::RegisterBenchmark("rocsparse_scsrmv", run_benchmark, - stream, batch_size, - handle, rocsparse_operation_none, - nrow, nrow, nnz, descrA, &alphaf, dAvalf, - dAptr, dAcol, dxf, &betaf, dyf), - benchmark::RegisterBenchmark("rocsparse_dcsrmv", run_benchmark, - stream, batch_size, - handle, rocsparse_operation_none, - nrow, nrow, nnz, descrA, &alphad, 
dAvald, - dAptr, dAcol, dxd, &betad, dyd) - }; - - for (auto& b:benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - b->Iterations(trials); - } - - benchmark::RunSpecifiedBenchmarks(); - - // Clear up on device - HIP_CHECK(hipFree(dAptr)); - HIP_CHECK(hipFree(dAcol)); - HIP_CHECK(hipFree(dAvalf)); - HIP_CHECK(hipFree(dAvald)); - HIP_CHECK(hipFree(dxf)); - HIP_CHECK(hipFree(dxd)); - HIP_CHECK(hipFree(dyf)); - HIP_CHECK(hipFree(dyd)); - - ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); - ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); - - return 0; -} diff --git a/benchmark/benchmark_spmv.cpp b/benchmark/benchmark_spmv.cpp deleted file mode 100644 index 9c7d0d59..00000000 --- a/benchmark/benchmark_spmv.cpp +++ /dev/null @@ -1,232 +0,0 @@ -/* ************************************************************************ - * Copyright 2018 Advanced Micro Devices, Inc. - * ************************************************************************ */ - -#include "benchmark/benchmark.h" -#include "utils.h" - -#include -#include - -#define HIP_CHECK(stat) \ -{ \ - hipError_t err = stat; \ - if (err != hipSuccess) \ - { \ - fprintf(stderr, "HIP error: %d line: %d\n", err, __LINE__); \ - exit(stat); \ - } \ -} - -#define ROCSPARSE_CHECK(stat) \ -{ \ - rocsparse_status err = stat; \ - if (err != rocsparse_status_success) \ - { \ - fprintf(stderr, "ROCSPARSE error: %d line: %d\n", err, __LINE__); \ - exit(stat); \ - } \ -} - -void csrmv(rocsparse_handle handle, rocsparse_operation trans, - int nrow, int ncol, int nnz, const float *alpha, - rocsparse_mat_descr descrA, const float *csrValA, - const int *csrRowPtrA, const int *csrColIndA, - const float *x, const float *beta, float *y) -{ - ROCSPARSE_CHECK(rocsparse_scsrmv(handle, trans, nrow, ncol, nnz, alpha, - descrA, csrValA, csrRowPtrA, csrColIndA, - x, beta, y)); -} - -void csrmv(rocsparse_handle handle, rocsparse_operation trans, - int nrow, int ncol, int nnz, const double *alpha, - 
rocsparse_mat_descr descrA, const double *csrValA, - const int *csrRowPtrA, const int *csrColIndA, - const double *x, const double *beta, double *y) -{ - ROCSPARSE_CHECK(rocsparse_dcsrmv(handle, trans, nrow, ncol, nnz, alpha, - descrA, csrValA, csrRowPtrA, csrColIndA, - x, beta, y)); -} - -template -void run_benchmark(benchmark::State &state, const hipStream_t stream, int batch, - rocsparse_handle handle, rocsparse_operation trans, - int nrow, int ncol, int nnz, rocsparse_mat_descr descr, - const ValueType *alpha, const ValueType *csrValA, - const int *csrRowPtrA, const int *csrColIndA, - const ValueType *x, const ValueType *beta, ValueType *y) -{ - // Warm up - for (int i=0; i<10; ++i) - { - csrmv(handle, rocsparse_operation_none, - nrow, ncol, nnz, alpha, descr, csrValA, - csrRowPtrA, csrColIndA, x, beta, y); - } - HIP_CHECK(hipDeviceSynchronize()); - - for (auto _:state) - { - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i=0; i >(end-start); - state.SetIterationTime(elapsed_seconds.count()); - } - state.SetBytesProcessed(state.iterations()*batch* - (sizeof(ValueType)*(2*nrow+nnz)+sizeof(int)*(nrow+1+nnz))); - state.SetItemsProcessed(state.iterations()*batch*2*nnz); -} - -int main(int argc, char *argv[]) -{ - int ndim = 2000; - int trials = 200; - int batch_size = 1; - - // Parse command line - if (argc > 1) - { - ndim = atoi(argv[1]); - } - if (argc > 2) - { - trials = atoi(argv[2]); - } - if (argc > 3) - { - batch_size = atoi(argv[3]); - } - - // rocSPARSE handle - rocsparse_handle handle; - ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); - - benchmark::Initialize(&argc, argv); - - hipStream_t stream = 0; - hipDeviceProp_t devProp; - int device_id = 0; - - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - printf("[HIP] Device name: %s\n", devProp.name); - - // Generate problem - int *Aptr = NULL; - int *Acol = NULL; - float *Avalf = NULL; - double *Avald = NULL; - int nrow = 
gen2DLaplacianUS(ndim, &Aptr, &Acol, &Avald); - int nnz = Aptr[nrow]; - - Avalf = (float*) malloc(sizeof(float)*nnz); - for (int i=0; i benchmarks = - { - benchmark::RegisterBenchmark("rocsparse_scsrmv", run_benchmark, - stream, batch_size, - handle, rocsparse_operation_none, - nrow, nrow, nnz, descrA, &alphaf, dAvalf, - dAptr, dAcol, dxf, &betaf, dyf), - benchmark::RegisterBenchmark("rocsparse_dcsrmv", run_benchmark, - stream, batch_size, - handle, rocsparse_operation_none, - nrow, nrow, nnz, descrA, &alphad, dAvald, - dAptr, dAcol, dxd, &betad, dyd) - }; - - for (auto& b:benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - b->Iterations(trials); - } - - benchmark::RunSpecifiedBenchmarks(); - - // Clear up on device - HIP_CHECK(hipFree(dAptr)); - HIP_CHECK(hipFree(dAcol)); - HIP_CHECK(hipFree(dAvalf)); - HIP_CHECK(hipFree(dAvald)); - HIP_CHECK(hipFree(dxf)); - HIP_CHECK(hipFree(dxd)); - HIP_CHECK(hipFree(dyf)); - HIP_CHECK(hipFree(dyd)); - - ROCSPARSE_CHECK(rocsparse_destroy_mat_descr(descrA)); - ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); - - return 0; -} diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt new file mode 100644 index 00000000..7f225779 --- /dev/null +++ b/clients/CMakeLists.txt @@ -0,0 +1,49 @@ +# ######################################################################## +# Copyright 2018 Advanced Micro Devices, Inc. 
+# ######################################################################## + +# The ROCm platform requires Ubuntu 16.04 or Fedora 24, which has cmake 3.5 +cmake_minimum_required( VERSION 3.5 ) + +# Consider removing this in the future +# This should appear before the project command, because it does not use FORCE +if(WIN32) + set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories") +else() + set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories") +endif() + +# This has to be initialized before the project() command appears +# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE +if(NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel.") +endif() + +# This project may compile dependencies for clients +project(rocsparse-clients LANGUAGES CXX) + +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) + +# This option only works for make/nmake and the ninja generators, but no reason it shouldn't be on all the time +# This tells cmake to create a compile_commands.json file that can be used with clang tooling or vim +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if(NOT TARGET rocsparse) + find_package(rocsparse REQUIRED CONFIG PATHS /opt/rocm/rocsparse) +endif() + +# Hip headers required of all clients; clients use hip to allocate device memory +find_package(hip REQUIRED CONFIG PATHS /opt/rocm) + +if(BUILD_CLIENTS_SAMPLES) + add_subdirectory(samples) +endif() + +if(BUILD_CLIENTS_BENCHMARKS) + add_subdirectory(benchmarks) +endif() + +if(BUILD_CLIENTS_TESTS) + enable_testing() + add_subdirectory(tests) +endif() diff --git a/clients/benchmarks/CMakeLists.txt b/clients/benchmarks/CMakeLists.txt new file mode 100644 index 
00000000..84997d2a --- /dev/null +++ b/clients/benchmarks/CMakeLists.txt @@ -0,0 +1,56 @@ +# ######################################################################## +# Copyright 2018 Advanced Micro Devices, Inc. +# ######################################################################## + +set(Boost_USE_MULTITHREADED ON) +set(Boost_DETAILED_FAILURE_MSG ON) +set(Boost_ADDITIONAL_VERSIONS 1.65.1 1.65) +set(Boost_USE_STATIC_LIBS OFF) + +find_package(Boost COMPONENTS program_options) + +if(NOT Boost_FOUND) + message(STATUS "Dynamic boost libraries not found. Attempting to find static libraries") + set(Boost_USE_STATIC_LIBS ON) + find_package(Boost COMPONENTS program_options) + + if(NOT Boost_FOUND) + message(FATAL_ERROR "boost is a required dependency and is not found; try adding boost path to CMAKE_PREFIX_PATH") + endif() +endif() + +set(ROCSPARSE_CLIENTS_COMMON + ../common/arg_check.cpp + ../common/unit.cpp + ../common/utility.cpp + ../common/rocsparse_template_specialization.cpp +) + +add_executable(rocsparse-bench client.cpp ${ROCSPARSE_CLIENTS_COMMON}) + +target_include_directories(rocsparse-bench + PRIVATE + $ +) + +target_include_directories(rocsparse-bench + SYSTEM + PRIVATE + $ + $ +) + +target_link_libraries(rocsparse-bench + PRIVATE + ${Boost_LIBRARIES} + roc::rocsparse + hip::hip_hcc + hip::hip_device +) + +foreach(amdgpu_target ${AMDGPU_TARGETS}) + target_link_libraries(rocsparse-bench + PRIVATE + --amdgpu-target=${amdgpu_target} + ) +endforeach() diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp new file mode 100644 index 00000000..52db4c85 --- /dev/null +++ b/clients/benchmarks/client.cpp @@ -0,0 +1,120 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "utility.hpp" +#include "rocsparse.hpp" +#include "testing_axpyi.hpp" + +#include +#include +#include +#include + +namespace po = boost::program_options; + +int main(int argc, char *argv[]) +{ + Arguments argus; + argus.unit_check = 0; + argus.timing = 1; + + std::string function; + char precision; + + rocsparse_int device_id; + + po::options_description desc("rocsparse client command line options"); + desc.add_options()("help,h", "produces this help message") + ("sizen,n", + po::value(&argus.N)->default_value(128), + "Specific vector size testing, LEVEL-1: the length of the dense vector.") + + ("sizennz,z", + po::value(&argus.nnz)->default_value(32), + "Specific vector size testing, LEVEL-1: the number of non-zero elements " + "of the sparse vector.") + + ("alpha", + po::value(&argus.alpha)->default_value(1.0), "specifies the scalar alpha") + + ("function,f", + po::value(&function)->default_value("axpyi"), + "SPARSE function to test. Options: axpyi") + + ("precision,r", + po::value(&precision)->default_value('s'), "Options: s,d") + + ("verify,v", + po::value(&argus.norm_check)->default_value(0), + "Validate GPU results with CPU? 
0 = No, 1 = Yes (default: No)") + + ("iters,i", + po::value(&argus.iters)->default_value(10), + "Iterations to run inside timing loop") + + ("device", + po::value(&device_id)->default_value(0), + "Set default device to be used for subsequent program runs"); + + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) + { + std::cout << desc << std::endl; + return 0; + } + + if (precision != 's' && precision != 'd') + { + fprintf(stderr, "Invalid value for --precision\n"); + return -1; + } + + if (argus.nnz >= argus.N) + { + fprintf(stderr, "Number of non-zeros should be less than vector length\n"); + return -1; + } + + // Device Query + rocsparse_int device_count = query_device_property(); + + if(device_count <= device_id) + { + fprintf(stderr, "Error: invalid device ID. There may not be such device ID. Will exit\n"); + return -1; + } + else + { + set_device(device_id); + } + + /* ============================================================================================ + */ +// if(argus.M < 0 || argus.N < 0 || argus.K < 0) +// { +// fprintf(stderr, "Invalid matrix dimension\n"); +// } + if (argus.N < 0) + { + fprintf(stderr, "Invalid dimension\n"); + return -1; + } + + if(function == "axpyi") + { + if(precision == 's') + testing_axpyi(argus); + else if(precision == 'd') + testing_axpyi(argus); + } + else + { + fprintf(stderr, "Invalid value for --function\n"); + return -1; + } + return 0; +} diff --git a/clients/common/arg_check.cpp b/clients/common/arg_check.cpp new file mode 100644 index 00000000..c12931ef --- /dev/null +++ b/clients/common/arg_check.cpp @@ -0,0 +1,79 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "arg_check.hpp" + +#include +#include +#include + +#ifdef GOOGLE_TEST +#include +#endif + +#define PRINT_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ + { \ + hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if(TMP_STATUS_FOR_CHECK != hipSuccess) \ + { \ + fprintf(stderr, \ + "hip error code: %d at %s:%d\n", \ + TMP_STATUS_FOR_CHECK, \ + __FILE__, \ + __LINE__); \ + } \ + } + +void verify_rocsparse_status_invalid_pointer(rocsparse_status status, + const char* message) +{ +#ifdef GOOGLE_TEST + ASSERT_EQ(status, rocsparse_status_invalid_pointer); +#else + if(status != rocsparse_status_invalid_pointer) + { + std::cerr << "rocSPARSE TEST ERROR: status != rocsparse_status_invalid_pointer, "; + std::cerr << message << std::endl; + } +#endif +} + +void verify_rocsparse_status_invalid_size(rocsparse_status status, + const char* message) +{ +#ifdef GOOGLE_TEST + ASSERT_EQ(status, rocsparse_status_invalid_size); +#else + if(status != rocsparse_status_invalid_size) + { + std::cerr << "rocSPARSE TEST ERROR: status != rocsparse_status_invalid_size, "; + std::cerr << message << std::endl; + } +#endif +} + +void verify_rocsparse_status_invalid_handle(rocsparse_status status) +{ +#ifdef GOOGLE_TEST + ASSERT_EQ(status, rocsparse_status_invalid_handle); +#else + if(status != rocsparse_status_invalid_handle) + { + std::cerr << "rocSPARSE TEST ERROR: status != rocsparse_status_invalid_handle" << std::endl; + } +#endif +} + +void verify_rocsparse_status_success(rocsparse_status status, const char* message) +{ +#ifdef GOOGLE_TEST + ASSERT_EQ(status, rocsparse_status_success); +#else + if(status != rocsparse_status_success) + { + std::cerr << "rocSPARSE TEST ERROR: status != rocsparse_status_success, "; + std::cerr << message << std::endl; + } +#endif +} diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp new file mode 
100644 index 00000000..68c22eb6 --- /dev/null +++ b/clients/common/rocsparse_template_specialization.cpp @@ -0,0 +1,35 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "rocsparse.hpp" + +#include + +namespace rocsparse { + +template <> +rocsparse_status rocsparse_axpyi(rocsparse_handle handle, + rocsparse_int nnz, + const float *alpha, + const float *xVal, + const rocsparse_int *xInd, + float *y, + rocsparse_index_base idxBase) +{ + return rocsparse_saxpyi(handle, nnz, alpha, xVal, xInd, y, idxBase); +} + +template <> +rocsparse_status rocsparse_axpyi(rocsparse_handle handle, + rocsparse_int nnz, + const double *alpha, + const double *xVal, + const rocsparse_int *xInd, + double *y, + rocsparse_index_base idxBase) +{ + return rocsparse_daxpyi(handle, nnz, alpha, xVal, xInd, y, idxBase); +} + +} // namespace rocsparse diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp new file mode 100644 index 00000000..73ceb497 --- /dev/null +++ b/clients/common/unit.cpp @@ -0,0 +1,69 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "unit.hpp" + +#include +#include + +#ifdef GOOGLE_TEST +#include +#endif + +/* ========================================Gtest Unit Check + * ==================================================== */ + +/*! 
\brief Template: gtest unit compare two matrices float/double/complex */ +// Do not put a wrapper over ASSERT_FLOAT_EQ, sincer assert exit the current function NOT the test +// case +// a wrapper will cause the loop keep going + +template <> +void unit_check_general(rocsparse_int M, rocsparse_int N, float* hCPU, float* hGPU) +{ +#pragma unroll + for(rocsparse_int j = 0; j < N; j++) + { +#pragma unroll + for(rocsparse_int i = 0; i < M; i++) + { +#ifdef GOOGLE_TEST + ASSERT_FLOAT_EQ(hCPU[i + j], hGPU[i + j]); +#endif + } + } +} + +template <> +void unit_check_general(rocsparse_int M, rocsparse_int N, double* hCPU, double* hGPU) +{ +#pragma unroll + for(rocsparse_int j = 0; j < N; j++) + { +#pragma unroll + for(rocsparse_int i = 0; i < M; i++) + { +#ifdef GOOGLE_TEST + ASSERT_DOUBLE_EQ(hCPU[i + j], hGPU[i + j]); +#endif + } + } +} + +template <> +void unit_check_general( + rocsparse_int M, rocsparse_int N, rocsparse_int* hCPU, rocsparse_int* hGPU) +{ +#pragma unroll + for(rocsparse_int j = 0; j < N; j++) + { +#pragma unroll + for(rocsparse_int i = 0; i < M; i++) + { +#ifdef GOOGLE_TEST + ASSERT_EQ(hCPU[i + j], hGPU[i + j]); +#endif + } + } +} diff --git a/clients/common/utility.cpp b/clients/common/utility.cpp new file mode 100644 index 00000000..1e49ff43 --- /dev/null +++ b/clients/common/utility.cpp @@ -0,0 +1,96 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "utility.hpp" + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* ============================================================================================ */ +/* device query and print out their ID and name; return number of compute-capable devices. 
*/ +rocsparse_int query_device_property() +{ + int device_count; + rocsparse_status status = (rocsparse_status)hipGetDeviceCount(&device_count); + if(status != rocsparse_status_success) + { + printf("Query device error: cannot get device count\n"); + return -1; + } + else + { + printf("Query device success: there are %d devices\n", device_count); + } + + for(rocsparse_int i = 0; i < device_count; i++) + { + hipDeviceProp_t props; + rocsparse_status status = (rocsparse_status)hipGetDeviceProperties(&props, i); + if(status != rocsparse_status_success) + { + printf("Query device error: cannot get device ID %d's property\n", i); + } + else + { + printf("Device ID %d : %s\n", i, props.name); + printf("-------------------------------------------------------------------------\n"); + printf("with %ldMB memory, clock rate %dMHz @ computing capability %d.%d \n", + props.totalGlobalMem >> 20, + (int)(props.clockRate / 1000), + props.major, + props.minor); + printf( + "maxGridDimX %d, sharedMemPerBlock %ldKB, maxThreadsPerBlock %d, warpSize %d\n", + props.maxGridSize[0], + props.sharedMemPerBlock >> 10, + props.maxThreadsPerBlock, + props.warpSize); + + printf("-------------------------------------------------------------------------\n"); + } + } + + return device_count; +} + +/* set current device to device_id */ +void set_device(rocsparse_int device_id) +{ + rocsparse_status status = (rocsparse_status)hipSetDevice(device_id); + if(status != rocsparse_status_success) + { + printf("Set device error: cannot set device ID %d, there may not be such device ID\n", + (int)device_id); + } +} +/* ============================================================================================ */ +/* timing:*/ + +/*! \brief CPU Timer(in microsecond): synchronize with the default device and return wall time */ +double get_time_us(void) +{ + hipDeviceSynchronize(); + struct timeval tv; + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000 * 1000) + tv.tv_usec; +}; + +/*! 
\brief CPU Timer(in microsecond): synchronize with given queue/stream and return wall time */ +double get_time_us_sync(hipStream_t stream) +{ + hipStreamSynchronize(stream); + struct timeval tv; + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000 * 1000) + tv.tv_usec; +}; + +#ifdef __cplusplus +} +#endif diff --git a/clients/include/arg_check.hpp b/clients/include/arg_check.hpp new file mode 100644 index 00000000..3696690c --- /dev/null +++ b/clients/include/arg_check.hpp @@ -0,0 +1,23 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + +#pragma once +#ifndef ARG_CHECK_HPP +#define ARG_CHECK_HPP + +#include + +void verify_rocsparse_status_invalid_pointer(rocsparse_status status, + const char* message); + +void verify_rocsparse_status_invalid_size(rocsparse_status status, + const char* message); + +void verify_rocsparse_status_invalid_handle(rocsparse_status status); + +void verify_rocsparse_status_success(rocsparse_status status, + const char* message); + +#endif // ARG_CHECK_HPP diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp new file mode 100644 index 00000000..abe34782 --- /dev/null +++ b/clients/include/rocsparse.hpp @@ -0,0 +1,24 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef _ROCSPARSE_HPP_ +#define _ROCSPARSE_HPP_ + +#include + +namespace rocsparse { + +template +rocsparse_status rocsparse_axpyi(rocsparse_handle handle, + rocsparse_int nnz, + const T *alpha, + const T *xVal, + const rocsparse_int *xInd, + T *y, + rocsparse_index_base idxBase); + +} + +#endif // _ROCSPARSE_HPP_ diff --git a/clients/include/rocsparse_test_unique_ptr.hpp b/clients/include/rocsparse_test_unique_ptr.hpp new file mode 100644 index 00000000..82225838 --- /dev/null +++ b/clients/include/rocsparse_test_unique_ptr.hpp @@ -0,0 +1,64 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + +#pragma once +#ifndef GUARD_ROCSPARSE_MANAGE_PTR +#define GUARD_ROCSPARSE_MANAGE_PTR + +#include "arg_check.hpp" + +#include +#include +#include + +#define PRINT_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ + { \ + hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if(TMP_STATUS_FOR_CHECK != hipSuccess) \ + { \ + fprintf(stderr, \ + "hip error code: %d at %s:%d\n", \ + TMP_STATUS_FOR_CHECK, \ + __FILE__, \ + __LINE__); \ + } \ + } + +namespace rocsparse_test { + +// device_malloc wraps hipMalloc and provides same API as malloc +static void* device_malloc(size_t byte_size) +{ + void* pointer; + PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); + return pointer; +} + +// device_free wraps hipFree and provides same API as free +static void device_free(void* ptr) { + PRINT_IF_HIP_ERROR(hipFree(ptr)); +} + +struct handle_struct +{ + rocsparse_handle handle; + handle_struct() + { + rocsparse_status status = rocsparse_create_handle(&handle); + verify_rocsparse_status_success(status, "ERROR: handle_struct constructor"); + } + + ~handle_struct() + { + rocsparse_status status = rocsparse_destroy_handle(handle); + 
verify_rocsparse_status_success(status, "ERROR: handle_struct destructor"); + } +}; + +} // namespace rocsparse_test + +using rocsparse_unique_ptr = std::unique_ptr; + +#endif // GUARD_ROCSPARSE_MANAGE_PTR diff --git a/clients/include/testing_axpyi.hpp b/clients/include/testing_axpyi.hpp new file mode 100644 index 00000000..ed52e788 --- /dev/null +++ b/clients/include/testing_axpyi.hpp @@ -0,0 +1,246 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef TESTING_AXPYI_HPP +#define TESTING_AXPYI_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include + +typedef rocsparse_index_base base; + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_axpyi_bad_arg(void) +{ + I nnz = 100; + I safe_size = 100; + T alpha = 0.6; + base idxBase = rocsparse_index_base_zero; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), + device_free}; + auto dxInd_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), + device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), + device_free}; + + T *dxVal = (T*) dxVal_managed.get(); + I *dxInd = (I*) dxInd_managed.get(); + T *dy = (T*) dy_managed.get(); + + if(!dxInd || !dxVal || !dy) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // testing for (nullptr == dxInd) + { + I *dxInd_null = nullptr; + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd_null, dy, idxBase); + verify_rocsparse_status_invalid_pointer(status, "Error: xInd is nullptr"); + } + // testing for (nullptr == dxVal) + { + T *dxVal_null = nullptr; + status = 
rocsparse_axpyi(handle, nnz, &alpha, dxVal_null, dxInd, dy, idxBase); + verify_rocsparse_status_invalid_pointer(status, "Error: xVal is nullptr"); + } + // testing for (nullptr == dy) + { + T *dy_null = nullptr; + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd, dy_null, idxBase); + verify_rocsparse_status_invalid_pointer(status, "Error: y is nullptr"); + } + // testing for (nullptr == d_alpha) + { + T *d_alpha_null = nullptr; + status = rocsparse_axpyi(handle, nnz, d_alpha_null, dxVal, dxInd, dy, idxBase); + verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); + } + // testing for (nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + status = rocsparse_axpyi(handle_null, nnz, &alpha, dxVal, dxInd, dy, idxBase); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_axpyi(Arguments argus) +{ + I N = argus.N; + I nnz = argus.nnz; + I safe_size = 100; + T h_alpha = argus.alpha; + rocsparse_index_base idxBase = argus.idxBase; + rocsparse_status status; + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + // Argument sanity check before allocating invalid memory + if(nnz <= 0) + { + auto dxInd_managed = rocsparse_unique_ptr{device_malloc(sizeof(I) * safe_size), + device_free}; + auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), + device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), + device_free}; + + I *dxInd = (I*) dxInd_managed.get(); + T *dxVal = (T*) dxVal_managed.get(); + T *dy = (T*) dy_managed.get(); + + if(!dxInd || !dxVal || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dxInd || !dxVal || !dy"); + return rocsparse_status_memory_error; + } + + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + status = rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy, idxBase); + + if (nnz < 0) + 
{ + verify_rocsparse_status_invalid_size(status, "Error: nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "nnz == 0"); + } + + return rocsparse_status_success; + } + + // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice + std::vector hxInd(nnz); + std::vector hxVal(nnz); + std::vector hy_1(N); + std::vector hy_2(N); + std::vector hy_gold(N); + + // Initial Data on CPU + srand(12345ULL); + rocsparse_init_index(hxInd, nnz, 1, N); + rocsparse_init(hxVal, 1, nnz); + rocsparse_init(hy_1, 1, N); + + // copy vector is easy in STL; hy_gold = hx: save a copy in hy_gold which will be output of CPU + hy_2 = hy_1; + hy_gold = hy_1; + + // allocate memory on device + auto dxInd_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*nnz), + device_free}; + auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*nnz), + device_free}; + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*N), + device_free}; + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*N), + device_free}; + auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), + device_free}; + + I *dxInd = (I*) dxInd_managed.get(); + T *dxVal = (T*) dxVal_managed.get(); + T *dy_1 = (T*) dy_1_managed.get(); + T *dy_2 = (T*) dy_2_managed.get(); + T *d_alpha = (T*) d_alpha_managed.get(); + + if(!dxInd || !dxVal || !dy_1 || !dy_2 || !d_alpha) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dxInd || !dxVal || !dy_1 || !dy_2 || !d_alpha"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR(hipMemcpy(dxInd, hxInd.data(), sizeof(I) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dxVal, hxVal.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T) * N, hipMemcpyHostToDevice)); + + double gpu_time_used, cpu_time_used; + double rocsparse_gflops, cpu_gflops, rocsparse_bandwidth; + + 
if(argus.unit_check) + { + CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * N, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + + // ROCSPARSE pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy_1, idxBase)); + + // ROCSPARSE pointer mode device + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, d_alpha, dxVal, dxInd, dy_2, idxBase)); + + // copy output from device to CPU + CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * N, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * N, hipMemcpyDeviceToHost)); + + // CPU + cpu_time_used = get_time_us(); + + for (int i=0; i + +typedef rocsparse_index_base base; + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_csrmv_bad_arg(void) +{ + I nnz = 100; + I safe_size = 100; + T alpha = 0.6; + base idxBase = rocsparse_index_base_zero; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), + device_free}; + auto dxInd_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), + device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), + device_free}; + + T *dxVal = (T*) dxVal_managed.get(); + I *dxInd = (I*) dxInd_managed.get(); + T *dy = (T*) dy_managed.get(); + + if(!dxInd || !dxVal || !dy) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // testing for (nullptr == dxInd) + { + I *dxInd_null = nullptr; + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd_null, dy, idxBase); + verify_rocsparse_status_invalid_pointer(status, "Error: 
xInd is nullptr"); + } + // testing for (nullptr == dxVal) + { + T *dxVal_null = nullptr; + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal_null, dxInd, dy, idxBase); + verify_rocsparse_status_invalid_pointer(status, "Error: xVal is nullptr"); + } + // testing for (nullptr == dy) + { + T *dy_null = nullptr; + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd, dy_null, idxBase); + verify_rocsparse_status_invalid_pointer(status, "Error: y is nullptr"); + } + // testing for (nullptr == d_alpha) + { + T *d_alpha_null = nullptr; + status = rocsparse_axpyi(handle, nnz, d_alpha_null, dxVal, dxInd, dy, idxBase); + verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); + } + // testing for (nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + status = rocsparse_axpyi(handle_null, nnz, &alpha, dxVal, dxInd, dy, idxBase); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_csrmv(Arguments argus) +{ + I N = argus.N; + I nnz = argus.nnz; + I safe_size = 100; + T h_alpha = argus.alpha; + rocsparse_index_base idxBase = argus.idxBase; + rocsparse_status status; + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + // Argument sanity check before allocating invalid memory + if(nnz <= 0) + { + auto dxInd_managed = rocsparse_unique_ptr{device_malloc(sizeof(I) * safe_size), + device_free}; + auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), + device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), + device_free}; + + I *dxInd = (I*) dxInd_managed.get(); + T *dxVal = (T*) dxVal_managed.get(); + T *dy = (T*) dy_managed.get(); + + if(!dxInd || !dxVal || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dxInd || !dxVal || !dy"); + return rocsparse_status_memory_error; + } + + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, 
rocsparse_pointer_mode_host)); + status = rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy, idxBase); + + if (nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "nnz == 0"); + } + + return rocsparse_status_success; + } + + // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice + std::vector hxInd(nnz); + std::vector hxVal(nnz); + std::vector hy_1(N); + std::vector hy_2(N); + std::vector hy_gold(N); + + // Initial Data on CPU + srand(12345ULL); + rocsparse_init_index(hxInd, nnz, 1, N); + rocsparse_init(hxVal, 1, nnz); + rocsparse_init(hy_1, 1, N); + + // copy vector is easy in STL; hy_gold = hx: save a copy in hy_gold which will be output of CPU + hy_2 = hy_1; + hy_gold = hy_1; + + // allocate memory on device + auto dxInd_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*nnz), + device_free}; + auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*nnz), + device_free}; + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*N), + device_free}; + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*N), + device_free}; + auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), + device_free}; + + I *dxInd = (I*) dxInd_managed.get(); + T *dxVal = (T*) dxVal_managed.get(); + T *dy_1 = (T*) dy_1_managed.get(); + T *dy_2 = (T*) dy_2_managed.get(); + T *d_alpha = (T*) d_alpha_managed.get(); + + if(!dxInd || !dxVal || !dy_1 || !dy_2 || !d_alpha) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dxInd || !dxVal || !dy_1 || !dy_2 || !d_alpha"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR(hipMemcpy(dxInd, hxInd.data(), sizeof(I) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dxVal, hxVal.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T) * N, 
hipMemcpyHostToDevice)); + + double gpu_time_used, cpu_time_used; + double rocsparse_gflops, cpu_gflops, rocsparse_bandwidth; + + if(argus.unit_check) + { + CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * N, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + + // ROCSPARSE pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy_1, idxBase)); + + // ROCSPARSE pointer mode device + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, d_alpha, dxVal, dxInd, dy_2, idxBase)); + + // copy output from device to CPU + CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * N, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * N, hipMemcpyDeviceToHost)); + + // CPU + cpu_time_used = get_time_us(); + + for (int i=0; i + +/* ===================================================================== + + Google Unit check: ASSERT_EQ( elementof(A), elementof(B)) + + =================================================================== */ + +/*!\file + * \brief compares two results (usually, CPU and GPU results); provides Google Unit check. + */ + +/* ========================================Gtest Unit Check + * ==================================================== */ + +/*! 
\brief Template: gtest unit compare two matrices float/double/complex */ +// Do not put a wrapper over ASSERT_FLOAT_EQ, sincer assert exit the current function NOT the test +// case +// a wrapper will cause the loop keep going +template +void unit_check_general(rocsparse_int M, rocsparse_int N, T* hCPU, T* hGPU); + +#endif // UNIT_HPP diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp new file mode 100644 index 00000000..859765ce --- /dev/null +++ b/clients/include/utility.hpp @@ -0,0 +1,182 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef TESTING_UTILITY_HPP +#define TESTING_UTILITY_HPP + +#include +#include +#include +#include +#include + +/*!\file + * \brief provide data initialization and timing utilities. + */ + +#define CHECK_HIP_ERROR(error) \ + if(error != hipSuccess) \ + { \ + fprintf(stderr, \ + "error: '%s'(%d) at %s:%d\n", \ + hipGetErrorString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } + +#define CHECK_ROCSPARSE_ERROR(error) \ + if(error != rocsparse_status_success) \ + { \ + fprintf(stderr, "rocSPARSE error: "); \ + if(error == rocsparse_status_invalid_handle) \ + { \ + fprintf(stderr, "rocsparse_status_invalid_handle"); \ + } \ + else if(error == rocsparse_status_not_implemented) \ + { \ + fprintf(stderr, " rocsparse_status_not_implemented"); \ + } \ + else if(error == rocsparse_status_invalid_pointer) \ + { \ + fprintf(stderr, "rocsparse_status_invalid_pointer"); \ + } \ + else if(error == rocsparse_status_invalid_size) \ + { \ + fprintf(stderr, "rocsparse_status_invalid_size"); \ + } \ + else if(error == rocsparse_status_memory_error) \ + { \ + fprintf(stderr, "rocsparse_status_memory_error"); \ + } \ + else if(error == rocsparse_status_internal_error) \ + { \ + fprintf(stderr, "rocsparse_status_internal_error"); 
\ + } \ + else \ + { \ + fprintf(stderr, "rocsparse_status error"); \ + } \ + fprintf(stderr, "\n"); \ + return error; \ + } + +/* ============================================================================================ */ +/* generate random number :*/ + +/*! \brief generate a random number between [0, 0.999...] . */ +template +T random_generator() +{ + // return rand()/( (T)RAND_MAX + 1); + return (T)(rand() % 10 + 1); // generate a integer number between [1, 10] +}; + +/* ============================================================================================ */ +/*! \brief matrix/vector initialization: */ +// for vector x (M=1, N=lengthX); +// for complex number, the real/imag part would be initialized with the same value +template +void rocsparse_init(std::vector& A, rocsparse_int M, rocsparse_int N) +{ + for(rocsparse_int i = 0; i < M; ++i) + { + for(rocsparse_int j = 0; j < N; ++j) + { + A[i + j] = random_generator(); + } + } +}; + +/* ============================================================================================ */ +/*! 
\brief vector initialization: */ +// initialize sparse index vector with nnz entries ranging from start to end +template +void rocsparse_init_index(std::vector &x, rocsparse_int nnz, + rocsparse_int start, rocsparse_int end) +{ + std::vector check(end-start, false); + int num = 0; + while (num < nnz) + { + rocsparse_int val = start + rand() % (end-start); + if (!check[val-start]) + { + x[num] = val; + check[val-start] = true; + ++num; + } + } + std::sort(x.begin(), x.end()); +}; + +#ifdef __cplusplus +extern "C" { +#endif + +/* ============================================================================================ */ +/* device query and print out their ID and name */ +rocsparse_int query_device_property(); + +/* set current device to device_id */ +void set_device(rocsparse_int device_id); + +/* ============================================================================================ */ +/* timing: HIP only provides very limited timers function clock() and not general; + rocsparse sync CPU and device and use more accurate CPU timer*/ + +/*! \brief CPU Timer(in microsecond): synchronize with the default device and return wall time */ +double get_time_us(void); + +/*! \brief CPU Timer(in microsecond): synchronize with given queue/stream and return wall time */ +double get_time_us_sync(hipStream_t stream); + +#ifdef __cplusplus +} +#endif + +/* ============================================================================================ */ + +/*! 
\brief Class used to parse command arguments in both client & gtest */ + +// has to compile with option "-std=c++11", and this rocsparse library uses c++11 everywhere +// c++11 allows intilization of member of a struct + +class Arguments +{ + public: + + rocsparse_int N = 128; + rocsparse_int nnz = 32; + + double alpha = 1.0; + + rocsparse_index_base idxBase = rocsparse_index_base_zero; + + rocsparse_int norm_check = 0; + rocsparse_int unit_check = 1; + rocsparse_int timing = 0; + + rocsparse_int iters = 10; + + Arguments& operator=(const Arguments& rhs) + { + N = rhs.N; + nnz = rhs.nnz; + + alpha = rhs.alpha; + + idxBase = rhs.idxBase; + + norm_check = rhs.norm_check; + unit_check = rhs.unit_check; + timing = rhs.timing; + + return *this; + } +}; + +#endif // TESTING_UTILITY_HPP diff --git a/example/CMakeLists.txt b/clients/samples/CMakeLists.txt similarity index 62% rename from example/CMakeLists.txt rename to clients/samples/CMakeLists.txt index 2c791398..4ab7e568 100644 --- a/example/CMakeLists.txt +++ b/clients/samples/CMakeLists.txt @@ -1,24 +1,22 @@ # ######################################################################## -# Copyright 2016 Advanced Micro Devices, Inc. +# Copyright 2018 Advanced Micro Devices, Inc. 
# ######################################################################## # Function to add rocsparse examples function(add_rocsparse_example EXAMPLE_SOURCE) get_filename_component(EXAMPLE_TARGET ${EXAMPLE_SOURCE} NAME_WE) add_executable(${EXAMPLE_TARGET} ${EXAMPLE_SOURCE}) - if(HIP_PLATFORM STREQUAL "hcc") + target_link_libraries(${EXAMPLE_TARGET} + PRIVATE + rocsparse + hip::hip_hcc + ) + foreach(amdgpu_target ${AMDGPU_TARGETS}) target_link_libraries(${EXAMPLE_TARGET} PRIVATE - rocsparse - hip::hip_hcc + --amdgpu-target=${amdgpu_target} ) - foreach(amdgpu_target ${AMDGPU_TARGETS}) - target_link_libraries(${EXAMPLE_TARGET} - PRIVATE - --amdgpu-target=${amdgpu_target} - ) - endforeach() - endif() + endforeach() set_target_properties(${EXAMPLE_TARGET} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/example" @@ -26,4 +24,4 @@ function(add_rocsparse_example EXAMPLE_SOURCE) endfunction() # Examples -add_rocsparse_example(rocsparse_handle.cpp) +add_rocsparse_example(example_handle.cpp) diff --git a/example/rocsparse_handle.cpp b/clients/samples/example_handle.cpp similarity index 100% rename from example/rocsparse_handle.cpp rename to clients/samples/example_handle.cpp diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt new file mode 100644 index 00000000..aa01457f --- /dev/null +++ b/clients/tests/CMakeLists.txt @@ -0,0 +1,51 @@ +# ######################################################################## +# Copyright 2018 Advanced Micro Devices, Inc. 
+# ######################################################################## + +find_package(GTest REQUIRED) + +set(ROCSPARSE_TEST_SOURCES + rocsparse_gtest_main.cpp + test_axpyi.cpp + test_csrmv.cpp +) + +set(ROCSPARSE_CLIENTS_COMMON + ../common/arg_check.cpp + ../common/unit.cpp + ../common/utility.cpp + ../common/rocsparse_template_specialization.cpp +) + +add_executable(rocsparse-test ${ROCSPARSE_TEST_SOURCES} ${ROCSPARSE_CLIENTS_COMMON}) + +target_compile_definitions(rocsparse-test PRIVATE GOOGLE_TEST) + +target_include_directories(rocsparse-test + PRIVATE + $ +) + +target_include_directories(rocsparse-test + SYSTEM + PRIVATE + $ + $ +) + +target_link_libraries(rocsparse-test + PRIVATE + roc::rocsparse + ${GTEST_BOTH_LIBRARIES} + hip::hip_hcc + hip::hip_device +) + +foreach(amdgpu_target ${AMDGPU_TARGETS}) + target_link_libraries(rocsparse-test + PRIVATE + --amdgpu-target=${amdgpu_target} + ) +endforeach() + +add_test(rocsparse-test rocsparse-test) diff --git a/clients/tests/rocsparse_gtest_main.cpp b/clients/tests/rocsparse_gtest_main.cpp new file mode 100644 index 00000000..91583232 --- /dev/null +++ b/clients/tests/rocsparse_gtest_main.cpp @@ -0,0 +1,33 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "utility.hpp" + +#include +#include + +/* ===================================================================== + Main function: +=================================================================== */ + +int main(int argc, char** argv) +{ + // Device Query + int device_id = 0; + int device_count = query_device_property(); + + if(device_count <= device_id) + { + fprintf(stderr, "Error: invalid device ID. There may not be such device ID. 
Will exit\n"); + return -1; + } + else + { + set_device(device_id); + } + + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/clients/tests/test_axpyi.cpp b/clients/tests/test_axpyi.cpp new file mode 100644 index 00000000..f37ef996 --- /dev/null +++ b/clients/tests/test_axpyi.cpp @@ -0,0 +1,64 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "testing_axpyi.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef rocsparse_index_base base; +typedef std::tuple axpyi_tuple; + +int axpyi_N_range[] = {12000, 15332, 22031}; +int axpyi_nnz_range[] = {-1, 0, 5, 10, 500, 1000, 7111, 10000}; +std::vector axpyi_alpha_range = {1.0, 0.0}; +base axpyi_idxBase_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; + +class parameterized_axpyi : public testing::TestWithParam +{ + protected: + parameterized_axpyi() {} + virtual ~parameterized_axpyi() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_axpyi_arguments(axpyi_tuple tup) +{ + Arguments arg; + arg.N = std::get<0>(tup); + arg.nnz = std::get<1>(tup); + arg.alpha = std::get<2>(tup); + arg.idxBase = std::get<3>(tup); + arg.timing = 0; + return arg; +} + +TEST(axpyi_bad_arg, axpyi_float) +{ + testing_axpyi_bad_arg(); +} + +TEST_P(parameterized_axpyi, axpyi_float) +{ + Arguments arg = setup_axpyi_arguments(GetParam()); + rocsparse_status status = testing_axpyi(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_axpyi, axpyi_double) +{ + Arguments arg = setup_axpyi_arguments(GetParam()); + rocsparse_status status = testing_axpyi(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(axpyi, parameterized_axpyi, + testing::Combine(testing::ValuesIn(axpyi_N_range), + testing::ValuesIn(axpyi_nnz_range), + 
testing::ValuesIn(axpyi_alpha_range), + testing::ValuesIn(axpyi_idxBase_range))); diff --git a/clients/tests/test_csrmv.cpp b/clients/tests/test_csrmv.cpp new file mode 100644 index 00000000..f0a39829 --- /dev/null +++ b/clients/tests/test_csrmv.cpp @@ -0,0 +1,63 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "testing_csrmv.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef rocsparse_index_base base; +typedef std::tuple csrmv_tuple; + +int csr_N_range[] = {12000, 15332, 22031}; +int csr_nnz_range[] = {-1, 0, 5, 10, 500, 1000, 7111, 10000}; +std::vector csr_alpha_range = {1.0, 0.0}; +base csr_idxBase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; + +class parameterized_csrmv : public testing::TestWithParam +{ + protected: + parameterized_csrmv() {} + virtual ~parameterized_csrmv() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_csrmv_arguments(csrmv_tuple tup) +{ + Arguments arg; + arg.N = std::get<0>(tup); + arg.nnz = std::get<1>(tup); + arg.alpha = std::get<2>(tup); + arg.idxBase = std::get<3>(tup); + arg.timing = 0; + return arg; +} + +TEST(csrmv_bad_arg, csrmv_float) +{ + testing_csrmv_bad_arg(); +} + +TEST_P(parameterized_csrmv, csrmv_float) +{ + Arguments arg = setup_csrmv_arguments(GetParam()); + rocsparse_status status = testing_csrmv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csrmv, csrmv_double) +{ + Arguments arg = setup_csrmv_arguments(GetParam()); + rocsparse_status status = testing_csrmv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(csrmv, parameterized_csrmv, + testing::Combine(testing::ValuesIn(csr_N_range), + testing::ValuesIn(csr_nnz_range), + testing::ValuesIn(csr_alpha_range), + testing::ValuesIn(csr_idxBase_range))); diff --git 
a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 86c3e922..d93112e2 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -105,11 +105,11 @@ if(BUILD_BENCHMARK) endif() # rocPRIM package -message(STATUS "Downloading rocPRIM.") -download_project(PROJ rocPRIM - GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git - GIT_TAG master -) +#message(STATUS "Downloading rocPRIM.") +#download_project(PROJ rocPRIM +# GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git +# GIT_TAG master +#) # ROCm package find_package(ROCM QUIET CONFIG PATHS /opt/rocm) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt deleted file mode 100644 index cc6412c2..00000000 --- a/test/CMakeLists.txt +++ /dev/null @@ -1,39 +0,0 @@ -# ######################################################################## -# Copyright 2016 Advanced Micro Devices, Inc. -# ######################################################################## - -# Function to add rocsparse tests -function(add_rocsparse_test TEST_NAME TEST_SOURCE) - get_filename_component(TEST_TARGET ${TEST_SOURCE} NAME_WE) - add_executable(${TEST_TARGET} ${TEST_SOURCE}) - target_include_directories(${TEST_TARGET} SYSTEM - PUBLIC - ${GTEST_INCLUDE_DIRS} - ${CMAKE_SOURCE_DIR}/utils - ) - if(HIP_PLATFORM STREQUAL "hcc") - target_link_libraries(${TEST_TARGET} - PRIVATE - rocsparse - ${GTEST_BOTH_LIBRARIES} - hip::hip_hcc - hip::hip_device - ) - foreach(amdgpu_target ${AMDGPU_TARGETS}) - target_link_libraries(${TEST_TARGET} - PRIVATE - --amdgpu-target=${amdgpu_target} - ) - endforeach() - endif() - set_target_properties(${TEST_TARGET} - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/test" - ) - add_test(${TEST_NAME} ${TEST_TARGET}) -endfunction() - -# Tests -add_rocsparse_test("device_apis" test_device_apis.cpp) -add_rocsparse_test("rocsparse.handle" test_rocsparse_handle.cpp) -add_rocsparse_test("rocsparse.csrmv" test_rocsparse_csrmv.cpp) diff --git a/test/test_device_apis.cpp 
b/test/test_device_apis.cpp deleted file mode 100644 index 3bcb8b23..00000000 --- a/test/test_device_apis.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* ************************************************************************ - * Copyright 2016 Advanced Micro Devices, Inc. - * ************************************************************************ */ - -#include -#include -#include - -#define HIP_CHECK(x) ASSERT_EQ(x, hipSuccess) - -template -__global__ -void axpy_kernel(const T *x, T *y, T a, size_t size) -{ - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - - if(i < size) - { - y[i] += a * x[i]; - } -} - -TEST(Tests, Saxpy) -{ - size_t N = 100; - - float a = 100.0f; - std::vector x(N, 2.0f); - std::vector y(N, 1.0f); - - float *d_x; - float *d_y; - HIP_CHECK(hipMalloc(&d_x, N*sizeof(float))); - HIP_CHECK(hipMalloc(&d_y, N*sizeof(float))); - HIP_CHECK(hipMemcpy(d_x, x.data(), - N*sizeof(float), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_y, y.data(), - N*sizeof(float), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(axpy_kernel), - dim3((N+255)/256), dim3(256), 0, 0, - d_x, d_y, a, N); - HIP_CHECK(hipPeekAtLastError()); - - HIP_CHECK(hipMemcpy(y.data(), d_y, - N*sizeof(float), - hipMemcpyDeviceToHost)); - HIP_CHECK(hipDeviceSynchronize()); - HIP_CHECK(hipFree(d_x)); - HIP_CHECK(hipFree(d_y)); - - for(size_t i=0; i x(N, 2.0f); - std::vector y(N, 1.0f); - - double *d_x; - double *d_y; - HIP_CHECK(hipMalloc(&d_x, N*sizeof(double))); - HIP_CHECK(hipMalloc(&d_y, N*sizeof(double))); - HIP_CHECK(hipMemcpy(d_x, x.data(), - N*sizeof(double), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_y, y.data(), - N*sizeof(double), - hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceSynchronize()); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(axpy_kernel), - dim3((N+255)/256), dim3(256), 0, 0, - d_x, d_y, a, N); - HIP_CHECK(hipPeekAtLastError()); - - HIP_CHECK(hipMemcpy(y.data(), d_y, - N*sizeof(double), - 
hipMemcpyDeviceToHost)); - HIP_CHECK(hipDeviceSynchronize()); - HIP_CHECK(hipFree(d_x)); - HIP_CHECK(hipFree(d_y)); - - for(size_t i=0; i -#include -#include -#include - -#define HIP_CHECK(x) ASSERT_EQ(x, hipSuccess) -#define ROCSPARSE_CHECK(x) ASSERT_EQ(x, rocsparse_status_success) - -TEST(Tests, rocsparse_scsrmv) -{ - rocsparse_handle handle; - ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); - - // Generate problem - int *Aptr = NULL; - int *Acol = NULL; - float *Aval = NULL; - int nrow = gen2DLaplacianUS(2000, &Aptr, &Acol, &Aval); - int nnz = Aptr[nrow]; - - // Sample some random data - srand(12345ULL); - - float alpha = (float) rand() / RAND_MAX; - float beta = (float) rand() / RAND_MAX; - - float *x = (float*) malloc(sizeof(float)*nrow); - float *y = (float*) malloc(sizeof(float)*nrow); - for (int i=0; i -#include - -#define ROCSPARSE_CHECK(x) ASSERT_EQ(x, rocsparse_status_success) - -TEST(Tests, handle) -{ - rocsparse_handle handle; - ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); - ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); -} diff --git a/utils/utils.h b/utils/utils.h deleted file mode 100644 index 92fd98ed..00000000 --- a/utils/utils.h +++ /dev/null @@ -1,264 +0,0 @@ -/* ************************************************************************ - * Copyright 2018 Advanced Micro Devices, Inc. 
- * ************************************************************************ */ - -#ifndef ROCSPARSE_UTILS_H_ -#define ROCSPARSE_UTILS_H_ - -#include -#include -#include -#include - -template -inline int gen2DLaplacianUS(int ndim, int **rowptr, int **col, T **val) -{ - - int n = ndim * ndim; - int nnz_mat = n * 5 - ndim * 4; - - *rowptr = (int*) malloc((n+1)*sizeof(int)); - *col = (int*) malloc(nnz_mat*sizeof(int)); - *val = (T*) malloc(nnz_mat*sizeof(T)); - - int nnz = 0; - - // Fill local arrays - for (int i=0; i -inline int readMatrixFromMTX(const char *filename, - int &nrow, int &ncol, int &nnz, - int **row, int **col, T **val) -{ - FILE *f = fopen(filename, "r"); - if (!f) - { - return -1; - } - - char line[1024]; - - // Check for banner - if (!fgets(line, 1024, f)) - { - return -1; - } - - char banner[16]; - char array[16]; - char coord[16]; - char data[16]; - char type[16]; - - // Extract banner - if (sscanf(line, "%s %s %s %s %s", banner, array, coord, data, type) != 5) - { - return -1; - } - - // Convert to lower case - for (char *p=array; *p!='\0'; *p=tolower(*p), p++); - for (char *p=coord; *p!='\0'; *p=tolower(*p), p++); - for (char *p=data; *p!='\0'; *p=tolower(*p), p++); - for (char *p=type; *p!='\0'; *p=tolower(*p), p++); - - // Check banner - if (strncmp(line, "%%MatrixMarket", 14) != 0) - { - return -1; - } - - // Check array type - if (strcmp(array, "matrix") != 0) - { - return -1; - } - - // Check coord - if (strcmp(coord, "coordinate") != 0) - { - return -1; - } - - // Check data - if (strcmp(data, "real") != 0) - { - return -1; - } - - // Check type - if (strcmp(type, "general") != 0 && - strcmp(type, "symmetric") != 0) - { - return -1; - } - - // Symmetric flag - int symm = !strcmp(type, "symmetric"); - - // Skip comments - while(fgets(line, 1024, f)) - { - if (line[0] != '%') - { - break; - } - } - - // Read dimensions - int snnz; - - sscanf(line, "%d %d %d", &nrow, &ncol, &snnz); - nnz = symm ? 
(snnz - nrow) * 2 + nrow : snnz; - - *row = (int*) malloc(sizeof(int)*nnz); - *col = (int*) malloc(sizeof(int)*nnz); - *val = (T*) malloc(sizeof(T)*nnz); - - // Read entries - int idx = 0; - while(fgets(line, 1024, f)) - { - int irow; - int icol; - double dval; - - sscanf(line, "%d %d %lf", &irow, &icol, &dval); - - --irow; - --icol; - - (*row)[idx] = irow; - (*col)[idx] = icol; - (*val)[idx] = (T) dval; - - ++idx; - - if (symm && irow != icol) { - - (*row)[idx] = icol; - (*col)[idx] = irow; - (*val)[idx] = (T) dval; - - ++idx; - - } - - } - - fclose(f); - - return 0; -} - -template -inline void coo_to_csr(int nrow, int ncol, int nnz, - const int *src_row, const int *src_col, const T *src_val, - int **dst_ptr, int **dst_col, T **dst_val) -{ - *dst_ptr = (int*) malloc(sizeof(int)*(nrow+1)); - *dst_col = (int*) malloc(sizeof(int)*nnz); - *dst_val = (T*) malloc(sizeof(T)*nnz); - - memset(*dst_ptr, 0, sizeof(int)*(nrow+1)); - - // Compute nnz entries per row - for (int i=0; i Date: Fri, 4 May 2018 13:17:06 +0200 Subject: [PATCH 024/304] csrmv example and laplacian matrix generator --- clients/include/utility.hpp | 63 ++++++++++++++ clients/samples/CMakeLists.txt | 13 ++- clients/samples/example_csrmv.cpp | 140 ++++++++++++++++++++++++++++++ 3 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 clients/samples/example_csrmv.cpp diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index 859765ce..bee38f94 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -113,6 +113,69 @@ void rocsparse_init_index(std::vector &x, rocsparse_int nnz, std::sort(x.begin(), x.end()); }; +/* ============================================================================================ */ +/*! 
\brief Generate 2D laplacian on unit square in CSR format */ +template +rocsparse_int gen_2d_laplacian(rocsparse_int ndim, + std::vector &rowptr, + std::vector &col, + std::vector &val) +{ + rocsparse_int n = ndim * ndim; + rocsparse_int nnz_mat = n * 5 - ndim * 4; + + rowptr.resize(n+1); + col.resize(nnz_mat); + val.resize(nnz_mat); + + rocsparse_int nnz = 0; + + // Fill local arrays + for (rocsparse_int i=0; i(-1); + ++nnz; + } + // if no left boundary element, connect with left neighbor + if (j != 0) + { + col[nnz] = idx - 1; + val[nnz] = static_cast(-1); + ++nnz; + } + // element itself + col[nnz] = idx; + val[nnz] = static_cast(4); + ++nnz; + // if no right boundary element, connect with right neighbor + if (j != ndim - 1) + { + col[nnz] = idx + 1; + val[nnz] = static_cast(-1); + ++nnz; + } + // if no lower boundary element, connect with lower neighbor + if (i != ndim - 1) + { + col[nnz] = idx + ndim; + val[nnz] = static_cast(-1); + ++nnz; + } + } + } + rowptr[n] = nnz; + + return n; +} + #ifdef __cplusplus extern "C" { #endif diff --git a/clients/samples/CMakeLists.txt b/clients/samples/CMakeLists.txt index 4ab7e568..85e56f8f 100644 --- a/clients/samples/CMakeLists.txt +++ b/clients/samples/CMakeLists.txt @@ -2,10 +2,20 @@ # Copyright 2018 Advanced Micro Devices, Inc. 
# ######################################################################## +set(ROCSPARSE_CLIENTS_COMMON + ../common/utility.cpp +) + # Function to add rocsparse examples function(add_rocsparse_example EXAMPLE_SOURCE) get_filename_component(EXAMPLE_TARGET ${EXAMPLE_SOURCE} NAME_WE) - add_executable(${EXAMPLE_TARGET} ${EXAMPLE_SOURCE}) + add_executable(${EXAMPLE_TARGET} ${EXAMPLE_SOURCE} ${ROCSPARSE_CLIENTS_COMMON}) + + target_include_directories(${EXAMPLE_TARGET} + PRIVATE + $ + ) + target_link_libraries(${EXAMPLE_TARGET} PRIVATE rocsparse @@ -25,3 +35,4 @@ endfunction() # Examples add_rocsparse_example(example_handle.cpp) +add_rocsparse_example(example_csrmv.cpp) diff --git a/clients/samples/example_csrmv.cpp b/clients/samples/example_csrmv.cpp new file mode 100644 index 00000000..687eeb11 --- /dev/null +++ b/clients/samples/example_csrmv.cpp @@ -0,0 +1,140 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "utility.hpp" + +#include +#include +#include +#include + +int main(int argc, char *argv[]) +{ + // Parse command line + if (argc < 2) + { + fprintf(stderr, "%s [ ]\n", argv[0]); + return -1; + } + + int ndim = atoi(argv[1]); + int trials = 200; + int batch_size = 1; + + if (argc > 2) + { + trials = atoi(argv[2]); + } + if (argc > 3) + { + batch_size = atoi(argv[3]); + } + + // rocSPARSE handle + rocsparse_handle handle; + rocsparse_create_handle(&handle); + + hipDeviceProp_t devProp; + int device_id = 0; + + hipGetDevice(&device_id); + hipGetDeviceProperties(&devProp, device_id); + printf("Device: %s\n", devProp.name); + + // Generate problem + std::vector hAptr; + std::vector hAcol; + std::vector hAval; + int nrow = gen_2d_laplacian(ndim, hAptr, hAcol, hAval); + int ncol = nrow; + int nnz = hAptr[nrow]; + + // Sample some random data + srand(12345ULL); + + double halpha = (double) rand() / 
RAND_MAX; + double hbeta = 0.0; + + std::vector hx(nrow); + rocsparse_init(hx, 1, nrow); + + // Matrix descriptor + rocsparse_mat_descr descrA; + rocsparse_create_mat_descr(&descrA); + + // Offload data to device + int *dAptr = NULL; + int *dAcol = NULL; + double *dAval = NULL; + double *dx = NULL; + double *dy = NULL; + + hipMalloc((void**) &dAptr, sizeof(int)*(nrow+1)); + hipMalloc((void**) &dAcol, sizeof(int)*nnz); + hipMalloc((void**) &dAval, sizeof(double)*nnz); + hipMalloc((void**) &dx, sizeof(double)*nrow); + hipMalloc((void**) &dy, sizeof(double)*nrow); + + hipMemcpy(dAptr, hAptr.data(), sizeof(int)*(nrow+1), hipMemcpyHostToDevice); + hipMemcpy(dAcol, hAcol.data(), sizeof(int)*nnz, hipMemcpyHostToDevice); + hipMemcpy(dAval, hAval.data(), sizeof(double)*nnz, hipMemcpyHostToDevice); + hipMemcpy(dx, hx.data(), sizeof(double)*nrow, hipMemcpyHostToDevice); + + // Warm up + for (int i=0; i<10; ++i) + { + // Call rocsparse csrmv + rocsparse_dcsrmv(handle, rocsparse_operation_none, + nrow, ncol, nnz, + &halpha, descrA, + dAval, dAptr, dAcol, + dx, &hbeta, dy); + } + + // Device synchronization + hipDeviceSynchronize(); + + // Start time measurement + double time = get_time_us(); + + // CSR matrix vector multiplication + for (int i=0; i(sizeof(double)*(2*nrow+nnz) + +sizeof(rocsparse_int)*(nrow+1+nnz))/time/1e6; + double gflops = static_cast(2*nnz)/time/1e6; + printf("nrow\t\tncol\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tusec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", + nrow, ncol, nnz, halpha, hbeta, gflops, bandwidth, time); + + + + + // Clear up on device + hipFree(dAptr); + hipFree(dAcol); + hipFree(dAval); + hipFree(dx); + hipFree(dy); + + rocsparse_destroy_mat_descr(descrA); + rocsparse_destroy_handle(handle); + + return 0; +} From b0b3c4037d607e00814be6f4bac39ec807a3d873 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sun, 6 May 2018 13:49:21 +0200 Subject: [PATCH 025/304] csrmv testing and benchmark --- clients/benchmarks/client.cpp 
| 43 ++- .../rocsparse_template_specialization.cpp | 40 +++ clients/include/rocsparse.hpp | 15 + clients/include/rocsparse_test_unique_ptr.hpp | 16 + clients/include/testing_axpyi.hpp | 2 +- clients/include/testing_csrmv.hpp | 304 ++++++++++++------ clients/include/utility.hpp | 252 ++++++++++++++- clients/tests/test_csrmv.cpp | 23 +- library/src/level2/rocsparse_csrmv.cpp | 2 +- 9 files changed, 573 insertions(+), 124 deletions(-) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 52db4c85..30482a40 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -4,6 +4,7 @@ #include "utility.hpp" #include "rocsparse.hpp" +#include "testing_csrmv.hpp" #include "testing_axpyi.hpp" #include @@ -26,21 +27,34 @@ int main(int argc, char *argv[]) po::options_description desc("rocsparse client command line options"); desc.add_options()("help,h", "produces this help message") + ("sizem,m", + po::value(&argus.M)->default_value(128), + "Specific matrix size testing: sizem is only applicable to SPARSE-2 " + "& SPARSE-3: the number of rows.") + ("sizen,n", po::value(&argus.N)->default_value(128), - "Specific vector size testing, LEVEL-1: the length of the dense vector.") + "Specific matrix/vector size testing: SPARSE-1: the length of the " + "dense vector. SPARSE-2 & SPARSE-3: the number of columns") ("sizennz,z", po::value(&argus.nnz)->default_value(32), "Specific vector size testing, LEVEL-1: the number of non-zero elements " "of the sparse vector.") + ("mtx", + po::value(&argus.filename)->default_value(""), "read from matrix " + "market (.mtx) format. This will override parameters m, n, and z") + ("alpha", po::value(&argus.alpha)->default_value(1.0), "specifies the scalar alpha") + ("beta", + po::value(&argus.beta)->default_value(0.0), "specifies the scalar beta") + ("function,f", po::value(&function)->default_value("axpyi"), - "SPARSE function to test. Options: axpyi") + "SPARSE function to test. 
Options: axpyi, csrmv") ("precision,r", po::value(&precision)->default_value('s'), "Options: s,d") @@ -73,12 +87,6 @@ int main(int argc, char *argv[]) return -1; } - if (argus.nnz >= argus.N) - { - fprintf(stderr, "Number of non-zeros should be less than vector length\n"); - return -1; - } - // Device Query rocsparse_int device_count = query_device_property(); @@ -94,23 +102,26 @@ int main(int argc, char *argv[]) /* ============================================================================================ */ -// if(argus.M < 0 || argus.N < 0 || argus.K < 0) -// { -// fprintf(stderr, "Invalid matrix dimension\n"); -// } - if (argus.N < 0) + if (argus.M < 0 || argus.N < 0) { fprintf(stderr, "Invalid dimension\n"); return -1; } - if(function == "axpyi") + if (function == "axpyi") { - if(precision == 's') + if (precision == 's') testing_axpyi(argus); - else if(precision == 'd') + else if (precision == 'd') testing_axpyi(argus); } + else if (function == "csrmv") + { + if (precision == 's') + testing_csrmv(argus); + else if (precision == 'd') + testing_csrmv(argus); + } else { fprintf(stderr, "Invalid value for --function\n"); diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 68c22eb6..abc9f8e4 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -32,4 +32,44 @@ rocsparse_status rocsparse_axpyi(rocsparse_handle handle, return rocsparse_daxpyi(handle, nnz, alpha, xVal, xInd, y, idxBase); } +template <> +rocsparse_status rocsparse_csrmv(rocsparse_handle handle, + rocsparse_operation transA, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const float *alpha, + const rocsparse_mat_descr descrA, + const float *csrValA, + const rocsparse_int *csrRowPtrA, + const rocsparse_int *csrColIndA, + const float *x, + const float *beta, + float *y) +{ + return rocsparse_scsrmv(handle, transA, m, n, nnz, alpha, + descrA, 
csrValA, csrRowPtrA, csrColIndA, + x, beta, y); +} + +template <> +rocsparse_status rocsparse_csrmv(rocsparse_handle handle, + rocsparse_operation transA, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const double *alpha, + const rocsparse_mat_descr descrA, + const double *csrValA, + const rocsparse_int *csrRowPtrA, + const rocsparse_int *csrColIndA, + const double *x, + const double *beta, + double *y) +{ + return rocsparse_dcsrmv(handle, transA, m, n, nnz, alpha, + descrA, csrValA, csrRowPtrA, csrColIndA, + x, beta, y); +} + } // namespace rocsparse diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index abe34782..834740f8 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -19,6 +19,21 @@ rocsparse_status rocsparse_axpyi(rocsparse_handle handle, T *y, rocsparse_index_base idxBase); +template +rocsparse_status rocsparse_csrmv(rocsparse_handle handle, + rocsparse_operation transA, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const T *alpha, + const rocsparse_mat_descr descrA, + const T *csrValA, + const rocsparse_int *csrRowPtrA, + const rocsparse_int *csrColIndA, + const T *x, + const T *beta, + T *y); + } #endif // _ROCSPARSE_HPP_ diff --git a/clients/include/rocsparse_test_unique_ptr.hpp b/clients/include/rocsparse_test_unique_ptr.hpp index 82225838..2bce602a 100644 --- a/clients/include/rocsparse_test_unique_ptr.hpp +++ b/clients/include/rocsparse_test_unique_ptr.hpp @@ -57,6 +57,22 @@ struct handle_struct } }; +struct descr_struct +{ + rocsparse_mat_descr descr; + descr_struct() + { + rocsparse_status status = rocsparse_create_mat_descr(&descr); + verify_rocsparse_status_success(status, "ERROR: descr_struct constructor"); + } + + ~descr_struct() + { + rocsparse_status status = rocsparse_destroy_mat_descr(descr); + verify_rocsparse_status_success(status, "ERROR: descr_struct destructor"); + } +}; + } // namespace rocsparse_test using rocsparse_unique_ptr = std::unique_ptr; 
diff --git a/clients/include/testing_axpyi.hpp b/clients/include/testing_axpyi.hpp index ed52e788..2faea25a 100644 --- a/clients/include/testing_axpyi.hpp +++ b/clients/include/testing_axpyi.hpp @@ -136,7 +136,7 @@ rocsparse_status testing_axpyi(Arguments argus) // Initial Data on CPU srand(12345ULL); - rocsparse_init_index(hxInd, nnz, 1, N); + rocsparse_init_index(hxInd.data(), nnz, 1, N); rocsparse_init(hxVal, 1, nnz); rocsparse_init(hy_1, 1, N); diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index 221ec4b5..5918b9a3 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -11,9 +11,10 @@ #include "utility.hpp" #include "unit.hpp" +#include #include -typedef rocsparse_index_base base; +typedef rocsparse_operation op; using namespace rocsparse; using namespace rocsparse_test; @@ -21,60 +22,106 @@ using namespace rocsparse_test; template void testing_csrmv_bad_arg(void) { + I n = 100; + I m = 100; I nnz = 100; I safe_size = 100; T alpha = 0.6; - base idxBase = rocsparse_index_base_zero; + T beta = 0.2; + + op trans = rocsparse_operation_none; rocsparse_status status; std::unique_ptr unique_ptr_handle(new handle_struct); rocsparse_handle handle = unique_ptr_handle->handle; - auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), - device_free}; - auto dxInd_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), - device_free}; + std::unique_ptr unique_ptr_descr(new descr_struct); + rocsparse_mat_descr descr = unique_ptr_descr->descr; + + auto dptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), + device_free}; + auto dcol_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), + device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), + device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), + device_free}; auto dy_managed = 
rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), device_free}; - T *dxVal = (T*) dxVal_managed.get(); - I *dxInd = (I*) dxInd_managed.get(); - T *dy = (T*) dy_managed.get(); + I *dptr = (I*) dptr_managed.get(); + I *dcol = (I*) dcol_managed.get(); + T *dval = (T*) dval_managed.get(); + T *dx = (T*) dx_managed.get(); + T *dy = (T*) dy_managed.get(); - if(!dxInd || !dxVal || !dy) + if(!dval || !dptr || !dcol || !dx || !dy) { PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); return; } - // testing for (nullptr == dxInd) + // testing for (nullptr == dptr) + { + I *dptr_null = nullptr; + status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, + dval, dptr_null, dcol, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); + } + // testing for (nullptr == dcol) { - I *dxInd_null = nullptr; - status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd_null, dy, idxBase); - verify_rocsparse_status_invalid_pointer(status, "Error: xInd is nullptr"); + I *dcol_null = nullptr; + status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, + dval, dptr, dcol_null, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); } - // testing for (nullptr == dxVal) + // testing for (nullptr == dval) { - T *dxVal_null = nullptr; - status = rocsparse_axpyi(handle, nnz, &alpha, dxVal_null, dxInd, dy, idxBase); - verify_rocsparse_status_invalid_pointer(status, "Error: xVal is nullptr"); + T *dval_null = nullptr; + status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, + dval_null, dptr, dcol, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); + } + // testing for (nullptr == dx) + { + T *dx_null = nullptr; + status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, + dval, dptr, dcol, dx_null, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); } // testing for (nullptr == dy) { T *dy_null = nullptr; - status = 
rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd, dy_null, idxBase); - verify_rocsparse_status_invalid_pointer(status, "Error: y is nullptr"); + status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, + dval, dptr, dcol, dx, &beta, dy_null); + verify_rocsparse_status_invalid_pointer(status, "Error: dy is nullptr"); } // testing for (nullptr == d_alpha) { T *d_alpha_null = nullptr; - status = rocsparse_axpyi(handle, nnz, d_alpha_null, dxVal, dxInd, dy, idxBase); + status = rocsparse_csrmv(handle, trans, m, n, nnz, d_alpha_null, descr, + dval, dptr, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } + // testing for (nullptr == d_beta) + { + T *d_beta_null = nullptr; + status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, + dval, dptr, dcol, dx, d_beta_null, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); + } + // testing for (nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr_null, + dval, dptr, dcol, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } // testing for (nullptr == handle) { rocsparse_handle handle_null = nullptr; - status = rocsparse_axpyi(handle_null, nnz, &alpha, dxVal, dxInd, dy, idxBase); + status = rocsparse_csrmv(handle_null, trans, m, n, nnz, &alpha, descr, + dval, dptr, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_handle(status); } } @@ -82,165 +129,238 @@ void testing_csrmv_bad_arg(void) template rocsparse_status testing_csrmv(Arguments argus) { - I N = argus.N; - I nnz = argus.nnz; - I safe_size = 100; - T h_alpha = argus.alpha; - rocsparse_index_base idxBase = argus.idxBase; + I safe_size = 100; + I nrow = argus.M; + I ncol = argus.N; + I nnz = argus.nnz == 32 ? 
nrow * 0.02 * ncol : argus.nnz; // 2% non zeros + T h_alpha = argus.alpha; + T h_beta = argus.beta; + op trans = argus.trans; rocsparse_status status; std::unique_ptr test_handle(new handle_struct); rocsparse_handle handle = test_handle->handle; + std::unique_ptr test_descr(new descr_struct); + rocsparse_mat_descr descr = test_descr->descr; + // Argument sanity check before allocating invalid memory - if(nnz <= 0) + if(nrow <= 0 || ncol <= 0 || nnz <= 0) { - auto dxInd_managed = rocsparse_unique_ptr{device_malloc(sizeof(I) * safe_size), - device_free}; - auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), - device_free}; - auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), - device_free}; - - I *dxInd = (I*) dxInd_managed.get(); - T *dxVal = (T*) dxVal_managed.get(); - T *dy = (T*) dy_managed.get(); - - if(!dxInd || !dxVal || !dy) + auto dptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), + device_free}; + auto dcol_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), + device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), + device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), + device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), + device_free}; + + I *dptr = (I*) dptr_managed.get(); + I *dcol = (I*) dcol_managed.get(); + T *dval = (T*) dval_managed.get(); + T *dx = (T*) dx_managed.get(); + T *dy = (T*) dy_managed.get(); + + if (!dval || !dptr || !dcol || !dx || !dy) { - verify_rocsparse_status_success(rocsparse_status_memory_error, "!dxInd || !dxVal || !dy"); + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dptr || !dcol || !dval || !dx || !dy"); return rocsparse_status_memory_error; } - CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - status = rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy, 
idxBase); + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, + rocsparse_pointer_mode_host)); + status = rocsparse_csrmv(handle, trans, nrow, ncol, nnz, &h_alpha, + descr, dval, dptr, dcol, dx, &h_beta, dy); - if (nnz < 0) + if (nrow < 0 || ncol < 0 || nnz < 0) { - verify_rocsparse_status_invalid_size(status, "Error: nnz < 0"); + verify_rocsparse_status_invalid_size(status, "Error: nrow < 0 || " + "ncol < 0 || nnz < 0"); } else { - verify_rocsparse_status_success(status, "nnz == 0"); + verify_rocsparse_status_success(status, "nrow >= 0 && ncol >= 0 && nnz >= 0"); } return rocsparse_status_success; } // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice - std::vector hxInd(nnz); - std::vector hxVal(nnz); - std::vector hy_1(N); - std::vector hy_2(N); - std::vector hy_gold(N); + std::vector hptr(nrow+1); + std::vector hcol(nnz); + std::vector hval(nnz); + std::vector hx(ncol); + std::vector hy_1(nrow); + std::vector hy_2(nrow); + std::vector hy_gold(nrow); // Initial Data on CPU srand(12345ULL); - rocsparse_init_index(hxInd, nnz, 1, N); - rocsparse_init(hxVal, 1, nnz); - rocsparse_init(hy_1, 1, N); + if (argus.filename != "") + { + std::vector coo_row; + std::vector coo_col; + std::vector coo_val; + + if (read_mtx_matrix(argus.filename.c_str(), + nrow, ncol, nnz, + coo_row, coo_col, coo_val) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + return rocsparse_status_internal_error; + } + + coo_to_csr(nrow, ncol, nnz, + coo_row, coo_col, coo_val, + hptr, hcol, hval); + coo_row.clear(); + coo_col.clear(); + coo_val.clear(); + hx.resize(ncol); + hy_1.resize(nrow); + hy_2.resize(nrow); + hy_gold.resize(nrow); + } + else + { + rocsparse_init_csr(hptr, hcol, hval, nrow, ncol, nnz); + } + + rocsparse_init(hx, 1, ncol); + rocsparse_init(hy_1, 1, nrow); // copy vector is easy in STL; hy_gold = hx: save a copy in hy_gold which will be output of CPU hy_2 = hy_1; hy_gold = hy_1; // allocate memory on 
device - auto dxInd_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*nnz), - device_free}; - auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*nnz), - device_free}; - auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*N), + auto dptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*(nrow+1)), + device_free}; + auto dcol_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*nnz), + device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*nnz), device_free}; - auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*N), + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*ncol), + device_free}; + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*nrow), + device_free}; + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*nrow), device_free}; auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + auto d_beta_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), + device_free}; - I *dxInd = (I*) dxInd_managed.get(); - T *dxVal = (T*) dxVal_managed.get(); - T *dy_1 = (T*) dy_1_managed.get(); - T *dy_2 = (T*) dy_2_managed.get(); + I *dptr = (I*) dptr_managed.get(); + I *dcol = (I*) dcol_managed.get(); + T *dval = (T*) dval_managed.get(); + T *dx = (T*) dx_managed.get(); + T *dy_1 = (T*) dy_1_managed.get(); + T *dy_2 = (T*) dy_2_managed.get(); T *d_alpha = (T*) d_alpha_managed.get(); + T *d_beta = (T*) d_beta_managed.get(); - if(!dxInd || !dxVal || !dy_1 || !dy_2 || !d_alpha) + if(!dval || !dptr || !dcol || !dx || !dy_1 || !dy_2 || !d_alpha || !d_beta) { verify_rocsparse_status_success(rocsparse_status_memory_error, - "!dxInd || !dxVal || !dy_1 || !dy_2 || !d_alpha"); + "!dval || !dptr || !dcol || !dx || !dy_1 || !dy_2 || !d_alpha || !d_beta"); return rocsparse_status_memory_error; } // copy data from CPU to device - CHECK_HIP_ERROR(hipMemcpy(dxInd, hxInd.data(), sizeof(I) * nnz, hipMemcpyHostToDevice)); - 
CHECK_HIP_ERROR(hipMemcpy(dxVal, hxVal.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T) * N, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dptr, hptr.data(), sizeof(I)*(nrow+1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcol, hcol.data(), sizeof(I)*nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dval, hval.data(), sizeof(T)*nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T)*ncol, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T)*nrow, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); double gpu_time_used, cpu_time_used; double rocsparse_gflops, cpu_gflops, rocsparse_bandwidth; if(argus.unit_check) { - CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * N, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T)*nrow, hipMemcpyHostToDevice)); // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy_1, idxBase)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrmv(handle, trans, nrow, ncol, nnz, &h_alpha, + descr, dval, dptr, dcol, dx, &h_beta, dy_1)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); - CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, d_alpha, dxVal, dxInd, dy_2, idxBase)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrmv(handle, trans, nrow, ncol, nnz, d_alpha, + descr, dval, dptr, dcol, dx, d_beta, dy_2)); // copy output from device to CPU - CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * N, hipMemcpyDeviceToHost)); - 
CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * N, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T)*nrow, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T)*nrow, hipMemcpyDeviceToHost)); // CPU cpu_time_used = get_time_us(); - for (int i=0; i +#include +#include #include #include #include @@ -95,7 +97,7 @@ void rocsparse_init(std::vector& A, rocsparse_int M, rocsparse_int N) /*! \brief vector initialization: */ // initialize sparse index vector with nnz entries ranging from start to end template -void rocsparse_init_index(std::vector &x, rocsparse_int nnz, +void rocsparse_init_index(I *x, rocsparse_int nnz, rocsparse_int start, rocsparse_int end) { std::vector check(end-start, false); @@ -110,9 +112,40 @@ void rocsparse_init_index(std::vector &x, rocsparse_int nnz, ++num; } } - std::sort(x.begin(), x.end()); + std::sort(x, x+nnz); }; +/* ============================================================================================ */ +/*! \brief csr matrix initialization */ +template +void rocsparse_init_csr(std::vector &ptr, std::vector &col, + std::vector &val, + rocsparse_int nrow, rocsparse_int ncol, rocsparse_int nnz) +{ + // Row offsets + ptr[0] = 0; + ptr[nrow] = nnz; + + for (rocsparse_int i=1; i(); + } +} + /* ============================================================================================ */ /*! \brief Generate 2D laplacian on unit square in CSR format */ template @@ -121,6 +154,10 @@ rocsparse_int gen_2d_laplacian(rocsparse_int ndim, std::vector &col, std::vector &val) { + if (ndim == 0) { + return 0; + } + rocsparse_int n = ndim * ndim; rocsparse_int nnz_mat = n * 5 - ndim * 4; @@ -176,6 +213,207 @@ rocsparse_int gen_2d_laplacian(rocsparse_int ndim, return n; } +/* ============================================================================================ */ +/*! 
\brief Read matrix from mtx file in COO format */ +template +rocsparse_int read_mtx_matrix(const char *filename, + rocsparse_int &nrow, + rocsparse_int &ncol, + rocsparse_int &nnz, + std::vector &row, + std::vector &col, + std::vector &val) +{ + FILE *f = fopen(filename, "r"); + if (!f) + { + return -1; + } + + char line[1024]; + + // Check for banner + if (!fgets(line, 1024, f)) + { + return -1; + } + + char banner[16]; + char array[16]; + char coord[16]; + char data[16]; + char type[16]; + + // Extract banner + if (sscanf(line, "%s %s %s %s %s", banner, array, coord, data, type) != 5) + { + return -1; + } + + // Convert to lower case + for (char *p=array; *p!='\0'; *p=tolower(*p), p++); + for (char *p=coord; *p!='\0'; *p=tolower(*p), p++); + for (char *p=data; *p!='\0'; *p=tolower(*p), p++); + for (char *p=type; *p!='\0'; *p=tolower(*p), p++); + + // Check banner + if (strncmp(line, "%%MatrixMarket", 14) != 0) + { + return -1; + } + + // Check array type + if (strcmp(array, "matrix") != 0) + { + return -1; + } + + // Check coord + if (strcmp(coord, "coordinate") != 0) + { + return -1; + } + + // Check data + if (strcmp(data, "real") != 0) + { + return -1; + } + + // Check type + if (strcmp(type, "general") != 0 && + strcmp(type, "symmetric") != 0) + { + return -1; + } + + // Symmetric flag + rocsparse_int symm = !strcmp(type, "symmetric"); + + // Skip comments + while(fgets(line, 1024, f)) + { + if (line[0] != '%') + { + break; + } + } + + // Read dimensions + rocsparse_int snnz; + + sscanf(line, "%d %d %d", &nrow, &ncol, &snnz); + nnz = symm ? 
(snnz - nrow) * 2 + nrow : snnz; + + row.resize(nnz); + col.resize(nnz); + val.resize(nnz); + + // Read entries + rocsparse_int idx = 0; + while(fgets(line, 1024, f)) + { + rocsparse_int irow; + rocsparse_int icol; + double dval; + + sscanf(line, "%d %d %lf", &irow, &icol, &dval); + + --irow; + --icol; + + row[idx] = irow; + col[idx] = icol; + val[idx] = (T) dval; + + ++idx; + + if (symm && irow != icol) { + + row[idx] = icol; + col[idx] = irow; + val[idx] = (T) dval; + + ++idx; + + } + + } + fclose(f); + + return 0; +} + +/* ============================================================================================ */ +/*! \brief Convert matrix from COO to CSR format */ +template +void coo_to_csr(rocsparse_int nrow, rocsparse_int ncol, rocsparse_int nnz, + const std::vector &src_row, + const std::vector &src_col, + const std::vector &src_val, + std::vector &dst_ptr, + std::vector &dst_col, + std::vector &dst_val) +{ + dst_ptr.resize(nrow+1, 0); + dst_col.resize(nnz); + dst_val.resize(nnz); + + // Compute nnz entries per row + for (rocsparse_int i=0; i #include -typedef rocsparse_index_base base; -typedef std::tuple csrmv_tuple; +typedef std::tuple csrmv_tuple; -int csr_N_range[] = {12000, 15332, 22031}; -int csr_nnz_range[] = {-1, 0, 5, 10, 500, 1000, 7111, 10000}; -std::vector csr_alpha_range = {1.0, 0.0}; -base csr_idxBase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +int csr_M_range[] = {-1, 0, 10, 500, 7111, 10000}; +int csr_N_range[] = {-3, 0, 33, 842, 4441, 10000}; +std::vector csr_alpha_range = {2.0, 3.0}; +std::vector csr_beta_range = {0.0, 1.0}; class parameterized_csrmv : public testing::TestWithParam { @@ -29,10 +28,10 @@ class parameterized_csrmv : public testing::TestWithParam Arguments setup_csrmv_arguments(csrmv_tuple tup) { Arguments arg; - arg.N = std::get<0>(tup); - arg.nnz = std::get<1>(tup); + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); arg.alpha = std::get<2>(tup); - arg.idxBase = std::get<3>(tup); + arg.beta = 
std::get<3>(tup); arg.timing = 0; return arg; } @@ -57,7 +56,7 @@ TEST_P(parameterized_csrmv, csrmv_double) } INSTANTIATE_TEST_CASE_P(csrmv, parameterized_csrmv, - testing::Combine(testing::ValuesIn(csr_N_range), - testing::ValuesIn(csr_nnz_range), + testing::Combine(testing::ValuesIn(csr_M_range), + testing::ValuesIn(csr_N_range), testing::ValuesIn(csr_alpha_range), - testing::ValuesIn(csr_idxBase_range))); + testing::ValuesIn(csr_beta_range))); diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index 90f58c52..9d840e36 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -102,7 +102,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if (descrA == nullptr) { - return rocsparse_status_invalid_handle; + return rocsparse_status_invalid_pointer; } // Logging TODO bench logging From a1f7b1f4879f6faf336b6a7949d86beb411edbea Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sun, 6 May 2018 13:52:32 +0200 Subject: [PATCH 026/304] apache license text added to csrmv_device.h --- library/src/level2/csrmv_device.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index cc74820e..192303ac 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -1,3 +1,19 @@ +/* ************************************************************************ +* Copyright 2015 Vratis, Ltd. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +* ************************************************************************ */ + #include // Knuth's Two-Sum algorithm, which allows us to add together two floating From 4326ddc00e9913b1078918f207367f03ecf3eb48 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 7 May 2018 16:36:59 +0200 Subject: [PATCH 027/304] hybmv added (pure ELL support only) --- library/include/rocsparse-auxiliary.h | 28 +- library/include/rocsparse-functions.h | 49 ++++ library/include/rocsparse-types.h | 15 +- library/src/CMakeLists.txt | 2 + library/src/conversion/csr2hyb_device.h | 57 +++++ library/src/conversion/rocsparse_csr2hyb.cpp | 233 +++++++++++++++++ library/src/include/handle.h | 34 +++ library/src/level2/csrmv_device.h | 6 + library/src/level2/ellmv_device.h | 50 ++++ library/src/level2/rocsparse_hybmv.cpp | 254 +++++++++++++++++++ library/src/rocsparse_auxiliary.cpp | 59 ++++- 11 files changed, 772 insertions(+), 15 deletions(-) create mode 100644 library/src/conversion/csr2hyb_device.h create mode 100644 library/src/conversion/rocsparse_csr2hyb.cpp create mode 100644 library/src/level2/ellmv_device.h create mode 100644 library/src/level2/rocsparse_hybmv.cpp diff --git a/library/include/rocsparse-auxiliary.h b/library/include/rocsparse-auxiliary.h index 64aec34d..2ad60733 100644 --- a/library/include/rocsparse-auxiliary.h +++ b/library/include/rocsparse-auxiliary.h @@ -31,32 +31,32 @@ ROCSPARSE_EXPORT rocsparse_status rocsparse_create_handle(rocsparse_handle *handle); /******************************************************************************** - * \brief destroy handle + * \brief Destroy handle. 
*******************************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_destroy_handle(rocsparse_handle handle); /******************************************************************************** - * \brief remove any streams from handle, and add one + * \brief Remove any streams from handle, and add one. *******************************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_set_stream(rocsparse_handle handle, hipStream_t stream); /******************************************************************************** - * \brief get stream [0] from handle + * \brief Get stream [0] from handle. *******************************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_get_stream(rocsparse_handle handle, hipStream_t *stream); /******************************************************************************** - * \brief set rocsparse_pointer_mode + * \brief Set rocsparse_pointer_mode. *******************************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_set_pointer_mode(rocsparse_handle handle, rocsparse_pointer_mode pointer_mode); /******************************************************************************** - * \brief get rocsparse_pointer_mode + * \brief Get rocsparse_pointer_mode. *******************************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, @@ -82,7 +82,7 @@ ROCSPARSE_EXPORT rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descrA); /******************************************************************************** - * \brief destroy matrix descriptor + * \brief Destroy the matrix descriptor. 
*******************************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descrA); @@ -113,6 +113,22 @@ rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descrA, ROCSPARSE_EXPORT rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descrA); +/******************************************************************************** + * \brief rocsparse_create_hyb_mat is a structure holding the rocsparse HYB + * matrix. It must be initialized using rocsparse_create_hyb_mat() + * and the retured handle must be passed to all subsequent library function + * calls that involve the HYB matrix. + * It should be destroyed at the end using rocsparse_destroy_hyb_mat(). + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat *hyb); + +/******************************************************************************** + * \brief Destroy HYB matrix. 
+ *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb); + #ifdef __cplusplus } #endif diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 090b6120..c6bd5865 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -178,6 +178,55 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, const rocsparse_double_complex *beta, rocsparse_double_complex *y); */ + +// TODO +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsr2hyb(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const float *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type); + +// TODO +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const double *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type); + +// TODO +ROCSPARSE_EXPORT +rocsparse_status rocsparse_shybmv(rocsparse_handle handle, + rocsparse_operation trans, + const float *alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, + const float *x, + const float *beta, + float *y); + +// TODO +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, + rocsparse_operation trans, + const double *alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, + const double *x, + const double *beta, + double *y); + /* * =========================================================================== * level 3 SPARSE diff --git a/library/include/rocsparse-types.h b/library/include/rocsparse-types.h index 
c21bea10..36070fcc 100644 --- a/library/include/rocsparse-types.h +++ b/library/include/rocsparse-types.h @@ -22,6 +22,7 @@ typedef int32_t rocsparse_int; typedef struct _rocsparse_handle *rocsparse_handle; typedef struct _rocsparse_mat_descr *rocsparse_mat_descr; +typedef struct _rocsparse_hyb_mat *rocsparse_hyb_mat; #ifdef __cplusplus extern "C" { @@ -46,11 +47,19 @@ typedef enum rocsparse_index_base_ { /*! \brief Used to specify the matrix type. */ typedef enum rocsparse_matrix_type_ { - rocsparse_matrix_type_general = 0, - rocsparse_matrix_type_symmetric = 1, - rocsparse_matrix_type_hermitian = 2 + rocsparse_matrix_type_general = 0, + rocsparse_matrix_type_symmetric = 1, + rocsparse_matrix_type_hermitian = 2, + rocsparse_matrix_type_triangular = 3 } rocsparse_matrix_type; +/*! \brief HYB matrix partition type. */ +typedef enum rocsparse_hyb_partition_ { + rocsparse_hyb_partition_auto = 0, + rocsparse_hyb_partition_user = 1, + rocsparse_hyb_partition_max = 2 +} rocsparse_hyb_partition; + /* ============================================================================================ */ /** * @brief rocsparse status codes definition diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 88655ac9..d25ff4ef 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -9,4 +9,6 @@ set(rocsparse_source src/rocsparse_auxiliary.cpp src/level1/rocsparse_axpyi.cpp src/level2/rocsparse_csrmv.cpp + src/level2/rocsparse_hybmv.cpp + src/conversion/rocsparse_csr2hyb.cpp ) diff --git a/library/src/conversion/csr2hyb_device.h b/library/src/conversion/csr2hyb_device.h new file mode 100644 index 00000000..fb6ff48e --- /dev/null +++ b/library/src/conversion/csr2hyb_device.h @@ -0,0 +1,57 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef CSR2HYB_DEVICE_H +#define CSR2HYB_DEVICE_H + +#include "handle.h" + +#include + +template +__device__ +void csr2ell_device(rocsparse_int m, + const T *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + rocsparse_int ell_width, + rocsparse_int *ell_col_ind, + T *ell_val) +{ + rocsparse_int ai = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if (ai >= m) + { + return; + } + + rocsparse_int p = 0; + rocsparse_int aj = csr_row_ptr[ai]; + + // Fill ELL matrix + for (; aj= ell_width) + { + break; + } + + rocsparse_int idx = ELL_IND(ai, p++, m, ell_width); + ell_col_ind[idx] = csr_col_ind[aj]; + ell_val[idx] = csr_val[aj]; + } + + // TODO store rownnz + + // Pad remaining ELL structure + for (; aj(0); + } +} + +#endif // CSR2HYB_DEVICE_H diff --git a/library/src/conversion/rocsparse_csr2hyb.cpp b/library/src/conversion/rocsparse_csr2hyb.cpp new file mode 100644 index 00000000..08ed46ee --- /dev/null +++ b/library/src/conversion/rocsparse_csr2hyb.cpp @@ -0,0 +1,233 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "csr2hyb_device.h" + +#include + +template +__global__ +void csr2ell_kernel(rocsparse_int m, + const T *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + rocsparse_int ell_width, + rocsparse_int *ell_col_ind, + T *ell_val) +{ + csr2ell_device(m, csr_val, csr_row_ptr, csr_col_ind, + ell_width, ell_col_ind, ell_val); +} + +template +rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const T *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type) +{ + // Check for valid handle and matrix descriptor + if (handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if (descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (hyb == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + log_trace(handle, + replaceX("rocsparse_Xcsr2hyb"), + m, + n, + (const void*&) descr, + (const void*&) csr_val, + (const void*&) csr_row_ptr, + (const void*&) csr_col_ind, + (const void*&) hyb, + user_ell_width, + partition_type); + + // Check matrix type + if (descr->base != rocsparse_index_base_zero) + { + // TODO + return rocsparse_status_not_implemented; + } + if (descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + if (partition_type != rocsparse_hyb_partition_max) + { + return rocsparse_status_not_implemented; + } + + // Check sizes + if (m < 0) + { + return rocsparse_status_invalid_size; + } + else if (n < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if (csr_val == nullptr) + { + return 
rocsparse_status_invalid_pointer; + } + else if (csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if (m == 0 || n == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + // Clear HYB structure if already allocated + hyb->m = m; + hyb->n = n; + hyb->partition = partition_type; + hyb->ell_nnz = 0; + hyb->ell_width = 0; + hyb->coo_nnz = 0; + + if (hyb->ell_col_ind) + { + hipFree(hyb->ell_col_ind); + } + if (hyb->ell_val) + { + hipFree(hyb->ell_val); + } + if (hyb->coo_row_ind) + { + hipFree(hyb->coo_row_ind); + } + if (hyb->coo_col_ind) + { + hipFree(hyb->coo_col_ind); + } + if (hyb->coo_val) + { + hipFree(hyb->coo_val); + } + +#define CSR2ELL_DIM 256 + dim3 csr2ell_blocks((m-1)/CSR2ELL_DIM+1); + dim3 csr2ell_threads(CSR2ELL_DIM); + + //TODO we take max partition + if (partition_type == rocsparse_hyb_partition_max) + { + // ELL part only, compute maximum non-zeros per row + + //TODO reduction with rocPRIM to compute maxrow + rocsparse_int *hbuf = (rocsparse_int*) malloc(sizeof(rocsparse_int)*(m+1)); + hipMemcpy(hbuf, csr_row_ptr, sizeof(rocsparse_int)*(m+1), hipMemcpyDeviceToHost); + + for (rocsparse_int i=0; iell_width = rownnz > hyb->ell_width ? 
rownnz : hyb->ell_width; + } + free(hbuf); + // END TODO + } + else + { + // TODO + return rocsparse_status_not_implemented; + } + + // Compute ELL non-zeros + hyb->ell_nnz = hyb->ell_width * m; + + // Allocate ELL part + hipMalloc((void**) &hyb->ell_col_ind, sizeof(rocsparse_int)*hyb->ell_nnz); + hipMalloc(&hyb->ell_val, sizeof(T)*hyb->ell_nnz); + + + + + hipLaunchKernelGGL((csr2ell_kernel), + csr2ell_blocks, csr2ell_threads, 0, stream, + m, csr_val, csr_row_ptr, csr_col_ind, + hyb->ell_width, hyb->ell_col_ind, (T*) hyb->ell_val); + +#undef CSR2ELL_DIM + + + + + + + return rocsparse_status_success; +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" +rocsparse_status rocsparse_scsr2hyb(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const float *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type) +{ + return rocsparse_csr2hyb_template(handle, m, n, + descr, csr_val, csr_row_ptr, csr_col_ind, + hyb, user_ell_width, partition_type); +} + +extern "C" +rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const double *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type) +{ + return rocsparse_csr2hyb_template(handle, m, n, + descr, csr_val, csr_row_ptr, csr_col_ind, + hyb, user_ell_width, partition_type); +} diff --git a/library/src/include/handle.h b/library/src/include/handle.h index 45532b82..4033631c 100644 --- a/library/src/include/handle.h +++ b/library/src/include/handle.h @@ -70,4 +70,38 @@ struct _rocsparse_mat_descr 
rocsparse_index_base base = rocsparse_index_base_zero; }; +/******************************************************************************** + * \brief rocsparse_hyb_mat is a structure holding the rocsparse HYB matrix. + * It must be initialized using rocsparse_create_hyb_mat() and the returned + * handle must be passed to all subsequent library function calls that involve + * the HYB matrix. + * It should be destroyed at the end using rocsparse_destroy_hyb_mat(). + *******************************************************************************/ +struct _rocsparse_hyb_mat +{ + // num rows + rocsparse_int m = 0; + // num cols + rocsparse_int n = 0; + // partition type + rocsparse_hyb_partition partition = rocsparse_hyb_partition_auto; + // ELL matrix part + rocsparse_int ell_nnz = 0; + rocsparse_int ell_width = 0; + rocsparse_int *ell_col_ind = nullptr; + void *ell_val = nullptr; + // COO matrix part + rocsparse_int coo_nnz = 0; + rocsparse_int *coo_row_ind = nullptr; + rocsparse_int *coo_col_ind = nullptr; + void *coo_val = nullptr; +}; + +/******************************************************************************** + * \brief ELL format indexing + *******************************************************************************/ +#define ELL_IND_ROW(i, el, m, width) (el) * (m) + (i) +#define ELL_IND_EL (i, el, m, width) (el) + (width) * (i) +#define ELL_IND(i, el, m, width) ELL_IND_ROW(i, el, m, width) + #endif // HANDLE_H diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index 192303ac..d1d5c579 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -14,6 +14,10 @@ * limitations under the License. 
* ************************************************************************ */ +#pragma once +#ifndef CSRMV_DEVICE_H +#define CSRMV_DEVICE_H + #include // Knuth's Two-Sum algorithm, which allows us to add together two floating @@ -213,3 +217,5 @@ void csrmvn_general_device(int num_rows, } } } + +#endif // CSRMV_DEVICE_H diff --git a/library/src/level2/ellmv_device.h b/library/src/level2/ellmv_device.h new file mode 100644 index 00000000..9c62f7bb --- /dev/null +++ b/library/src/level2/ellmv_device.h @@ -0,0 +1,50 @@ +#pragma once +#ifndef ELLMV_DEVICE_H +#define ELLMV_DEVICE_H + +#include "handle.h" + +#include + +template +static __device__ +void ellmvn_device(rocsparse_int m, + rocsparse_int n, + rocsparse_int ell_width, + T alpha, + const rocsparse_int *ell_col_ind, + const T *ell_val, + const T *x, + T beta, + T *y) +{ + rocsparse_int ai = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + + if (ai >= m) + { + return; + } + + T sum = static_cast(0); + for (rocsparse_int p=0; p= 0 && col < n) + { + sum += ell_val[idx] * x[col]; + } + } + + if (beta != static_cast(0)) + { + y[ai] = beta * y[ai] + alpha * sum; + } + else + { + y[ai] = alpha * sum; + } +} + +#endif // ELLMV_DEVICE_H diff --git a/library/src/level2/rocsparse_hybmv.cpp b/library/src/level2/rocsparse_hybmv.cpp new file mode 100644 index 00000000..c6035876 --- /dev/null +++ b/library/src/level2/rocsparse_hybmv.cpp @@ -0,0 +1,254 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "ellmv_device.h" + +#include + +template +__global__ +void ellmvn_kernel_host_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int ell_width, + T alpha, + const rocsparse_int *ell_col_ind, + const T *ell_val, + const T *x, + T beta, + T *y) +{ + ellmvn_device(m, n, ell_width, alpha, ell_col_ind, ell_val, x, beta, y); +} + +template +__global__ +void ellmvn_kernel_device_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int ell_width, + const T *alpha, + const rocsparse_int *ell_col_ind, + const T *ell_val, + const T *x, + const T *beta, + T *y) +{ + ellmvn_device(m, n, ell_width, *alpha, ell_col_ind, ell_val, x, *beta, y); +} + +template +rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, + rocsparse_operation trans, + const T *alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, + const T *x, + const T *beta, + T *y) +{ + // Check for valid handle and matrix descriptor + if (handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if (descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (hyb == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + if (handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_Xhybmv"), + trans, + *alpha, + (const void*&) descr, + (const void*&) hyb, + (const void*&) x, + *beta, + (const void*&) y); + } + else + { + log_trace(handle, + replaceX("rocsparse_Xhybmv"), + trans, + (const void*&) alpha, + (const void*&) descr, + (const void*&) hyb, + (const void*&) x, + (const void*&) beta, + (const void*&) y); + } + + // Check matrix type + if (descr->base != rocsparse_index_base_zero) + { + // TODO + return rocsparse_status_not_implemented; + } + if (descr->type != rocsparse_matrix_type_general) + { + // 
TODO + return rocsparse_status_not_implemented; + } + if (hyb->partition != rocsparse_hyb_partition_max) + { + return rocsparse_status_not_implemented; + } + + // Check sizes + if (hyb->m < 0) + { + return rocsparse_status_invalid_size; + } + else if (hyb->n < 0) + { + return rocsparse_status_invalid_size; + } + else if (hyb->ell_nnz + hyb->coo_nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check ELL-HYB structure + if (hyb->ell_nnz > 0) + { + if (hyb->ell_width < 0) + { + return rocsparse_status_invalid_size; + } + else if (hyb->ell_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (hyb->ell_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + } + + // Check COO-HYB structure + if (hyb->coo_nnz > 0) + { + if (hyb->coo_row_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (hyb->coo_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (hyb->coo_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + } + + // Check pointer arguments + if (x == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (alpha == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (beta == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if (hyb->m == 0 || hyb->n == 0 || + hyb->ell_nnz + hyb->coo_nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + // Run different hybmv kernels + if (trans == rocsparse_operation_none) + { +#define ELLMVN_DIM 512 + dim3 ellmvn_blocks((hyb->m-1)/ELLMVN_DIM+1); + dim3 ellmvn_threads(ELLMVN_DIM); + + if (handle->pointer_mode == rocsparse_pointer_mode_device) + { + } + else + { + if (*alpha == 0.0 && *beta == 1.0) + { + return rocsparse_status_success; + } + + // ELL part + if (hyb->ell_nnz > 0) + { + 
hipLaunchKernelGGL((ellmvn_kernel_host_pointer), + ellmvn_blocks, ellmvn_threads, 0, stream, + hyb->m, hyb->n, hyb->ell_width, *alpha, + hyb->ell_col_ind, (T*) hyb->ell_val, + x, *beta, y); + } + + } +#undef ELLMVN_DIM + } + else + { + // TODO + return rocsparse_status_not_implemented; + } + return rocsparse_status_success; +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" +rocsparse_status rocsparse_shybmv(rocsparse_handle handle, + rocsparse_operation trans, + const float *alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, + const float *x, + const float *beta, + float *y) +{ + return rocsparse_hybmv_template(handle, trans, alpha, + descr, hyb, x, beta, y); +} + +extern "C" +rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, + rocsparse_operation trans, + const double *alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, + const double *x, + const double *beta, + double *y) +{ + return rocsparse_hybmv_template(handle, trans, alpha, + descr, hyb, x, beta, y); +} diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index f1f80592..80d719db 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -21,7 +21,7 @@ rocsparse_status rocsparse_create_handle(rocsparse_handle *handle) // Check if handle is valid if (handle == nullptr) { - return rocsparse_status_invalid_pointer; + return rocsparse_status_invalid_handle; } else { @@ -70,7 +70,7 @@ rocsparse_status rocsparse_set_pointer_mode(rocsparse_handle handle, // Check if handle is valid if (handle == nullptr) { - return rocsparse_status_invalid_pointer; + return rocsparse_status_invalid_handle; } handle->pointer_mode = mode; log_trace(handle, "rocsparse_set_pointer_mode", mode); @@ -87,7 +87,7 @@ rocsparse_status 
rocsparse_get_pointer_mode(rocsparse_handle handle, // Check if handle is valid if (handle == nullptr) { - return rocsparse_status_invalid_pointer; + return rocsparse_status_invalid_handle; } *mode = handle->pointer_mode; log_trace(handle, "rocsparse_get_pointer_mode", *mode); @@ -105,7 +105,7 @@ rocsparse_status rocsparse_set_stream(rocsparse_handle handle, // Check if handle is valid if (handle == nullptr) { - return rocsparse_status_invalid_pointer; + return rocsparse_status_invalid_handle; } log_trace(handle, "rocsparse_set_stream", stream_id); return handle->set_stream(stream_id); @@ -121,7 +121,7 @@ rocsparse_status rocsparse_get_stream(rocsparse_handle handle, // Check if handle is valid if (handle == nullptr) { - return rocsparse_status_invalid_pointer; + return rocsparse_status_invalid_handle; } log_trace(handle, "rocsparse_get_stream", *stream_id); return handle->get_stream(stream_id); @@ -139,7 +139,7 @@ rocsparse_status rocsparse_get_version(rocsparse_handle handle, int *version) // Check if handle is valid if (handle == nullptr) { - return rocsparse_status_invalid_pointer; + return rocsparse_status_invalid_handle; } *version = ROCSPARSE_VERSION_MAJOR * 100000 + ROCSPARSE_VERSION_MINOR * 100 @@ -265,3 +265,50 @@ rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descrA) } return descrA->type; } + +/******************************************************************************** + * \brief rocsparse_create_hyb_mat is a structure holding the rocsparse HYB + * matrix. It must be initialized using rocsparse_create_hyb_mat() + * and the retured handle must be passed to all subsequent library function + * calls that involve the HYB matrix. + * It should be destroyed at the end using rocsparse_destroy_hyb_mat(). 
+ *******************************************************************************/ +extern "C" +rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat *hyb) +{ + if (hyb == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else + { + // Allocate + try + { + *hyb = new _rocsparse_hyb_mat; + } + catch(rocsparse_status status) + { + return status; + } + return rocsparse_status_success; + } +} + +/******************************************************************************** + * \brief Destroy HYB matrix. + *******************************************************************************/ +extern "C" +rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb) +{ + // Destruct + try + { + delete hyb; + } + catch(rocsparse_status status) + { + return status; + } + return rocsparse_status_success; +} From 1236b1e5766e1914632cde3396a5975ef199c2e0 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 7 May 2018 17:04:15 +0200 Subject: [PATCH 028/304] consistency --- library/include/rocsparse-auxiliary.h | 12 +-- library/include/rocsparse-functions.h | 74 ++++++------- library/src/level1/rocsparse_axpyi.cpp | 76 ++++++------- library/src/level2/rocsparse_csrmv.cpp | 142 ++++++++++++------------- library/src/rocsparse_auxiliary.cpp | 34 +++--- 5 files changed, 169 insertions(+), 169 deletions(-) diff --git a/library/include/rocsparse-auxiliary.h b/library/include/rocsparse-auxiliary.h index 2ad60733..d21710a3 100644 --- a/library/include/rocsparse-auxiliary.h +++ b/library/include/rocsparse-auxiliary.h @@ -79,39 +79,39 @@ rocsparse_status rocsparse_get_version(rocsparse_handle handle, int *version); * It should be destroyed at the end using rocsparse_destroy_mat_descr(). 
*******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descrA); +rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descr); /******************************************************************************** * \brief Destroy the matrix descriptor. *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descrA); +rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descr); /******************************************************************************** * \brief Set the index base of the matrix descriptor. *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descrA, +rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descr, rocsparse_index_base base); /******************************************************************************** * \brief Returns the index base of the matrix descriptor. *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr descrA); +rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr descr); /******************************************************************************** * \brief Set the matrix type of the matrix descriptor. *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descrA, +rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descr, rocsparse_matrix_type type); /******************************************************************************** * \brief Returns the matrix type of the matrix descriptor. 
*******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descrA); +rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr); /******************************************************************************** * \brief rocsparse_create_hyb_mat is a structure holding the rocsparse HYB diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index c6bd5865..034232c5 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -38,37 +38,37 @@ ROCSPARSE_EXPORT rocsparse_status rocsparse_saxpyi(rocsparse_handle handle, rocsparse_int nnz, const float *alpha, - const float *xVal, - const rocsparse_int *xInd, + const float *x_val, + const rocsparse_int *x_ind, float *y, - rocsparse_index_base idxBase); + rocsparse_index_base idx_base); ROCSPARSE_EXPORT rocsparse_status rocsparse_daxpyi(rocsparse_handle handle, rocsparse_int nnz, const double *alpha, - const double *xVal, - const rocsparse_int *xInd, + const double *x_val, + const rocsparse_int *x_ind, double *y, - rocsparse_index_base idxBase); + rocsparse_index_base idx_base); /* ROCSPARSE_EXPORT rocsparse_status rocsparse_caxpyi(rocsparse_handle handle, rocsparse_int nnz, const rocsparse_float_complex *alpha, - const rocsparse_float_complex *xVal, - const rocsparse_int *xInd, + const rocsparse_float_complex *x_val, + const rocsparse_int *x_ind, rocsparse_float_complex *y, - rocsparse_index_base idxBase); + rocsparse_index_base idx_base); ROCSPARSE_EXPORT rocsparse_status rocsparse_zaxpyi(rocsparse_handle handle, rocsparse_int nnz, const rocsparse_double_complex *alpha, - const rocsparse_double_complex *xVal, - const rocsparse_int *xInd, + const rocsparse_double_complex *x_val, + const rocsparse_int *x_ind, rocsparse_double_complex *y, - rocsparse_index_base idxBase); + rocsparse_index_base idx_base); */ /* @@ -90,7 +90,7 
@@ rocsparse_status rocsparse_zaxpyi(rocsparse_handle handle, handle rocsparse_handle. handle to the rocsparse library context queue. @param[in] - transA operation type of A. + trans operation type of A. @param[in] m number of rows of A. @param[in] @@ -100,14 +100,14 @@ rocsparse_status rocsparse_zaxpyi(rocsparse_handle handle, @param[in] alpha scalar alpha. @param[in] - descrA descriptor of A. + descr descriptor of A. @param[in] - csrValA array of nnz elements of A. + csr_val array of nnz elements of A. @param[in] - csrRowPtrA array of m+1 elements that point to the start + csr_row_ptr array of m+1 elements that point to the start of every row of A. @param[in] - csrColIndA array of nnz elements containing the column indices of A. + csr_col_ind array of nnz elements containing the column indices of A. @param[in] x array of n elements (op(A) = A) or m elements (op(A) = A^T or op(A) = A^H). @@ -120,60 +120,60 @@ rocsparse_status rocsparse_zaxpyi(rocsparse_handle handle, ********************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, - rocsparse_operation transA, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, const float *alpha, - const rocsparse_mat_descr descrA, - const float *csrValA, - const rocsparse_int *csrRowPtrA, - const rocsparse_int *csrColIndA, + const rocsparse_mat_descr descr, + const float *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, const float *x, const float *beta, float *y); ROCSPARSE_EXPORT rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, - rocsparse_operation transA, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, const double *alpha, - const rocsparse_mat_descr descrA, - const double *csrValA, - const rocsparse_int *csrRowPtrA, - const rocsparse_int *csrColIndA, + const rocsparse_mat_descr descr, + const double *csr_val, + const rocsparse_int 
*csr_row_ptr, + const rocsparse_int *csr_col_ind, const double *x, const double *beta, double *y); /* ROCSPARSE_EXPORT rocsparse_status rocsparse_ccsrmv(rocsparse_handle handle, - rocsparse_operation transA, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, const rocsparse_float_complex *alpha, - const rocsparse_mat_descr descrA, - const rocsparse_float_complex *csrValA, - const rocsparse_int *csrRowPtrA, - const rocsparse_int *csrColIndA, + const rocsparse_mat_descr descr, + const rocsparse_float_complex *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, const rocsparse_float_complex *x, const rocsparse_float_complex *beta, rocsparse_float_complex *y); ROCSPARSE_EXPORT rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, - rocsparse_operation transA, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, const rocsparse_double_complex *alpha, - const rocsparse_mat_descr descrA, - const rocsparse_double_complex *csrValA, - const rocsparse_int *csrRowPtrA, - const rocsparse_int *csrColIndA, + const rocsparse_mat_descr descr, + const rocsparse_double_complex *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, const rocsparse_double_complex *x, const rocsparse_double_complex *beta, rocsparse_double_complex *y); diff --git a/library/src/level1/rocsparse_axpyi.cpp b/library/src/level1/rocsparse_axpyi.cpp index a4b84677..32cfef41 100644 --- a/library/src/level1/rocsparse_axpyi.cpp +++ b/library/src/level1/rocsparse_axpyi.cpp @@ -12,10 +12,10 @@ template __device__ void axpyi_device(rocsparse_int nnz, T alpha, - const T *xVal, - const rocsparse_int *xInd, + const T *x_val, + const rocsparse_int *x_ind, T *y, - rocsparse_index_base idxBase) + rocsparse_index_base idx_base) { int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -24,31 +24,31 @@ void axpyi_device(rocsparse_int nnz, return; } - y[xInd[tid]-idxBase] += alpha * xVal[tid]; + 
y[x_ind[tid]-idx_base] += alpha * x_val[tid]; } template __global__ void axpyi_kernel_host_scalar(rocsparse_int nnz, T alpha, - const T *xVal, - const rocsparse_int *xInd, + const T *x_val, + const rocsparse_int *x_ind, T *y, - rocsparse_index_base idxBase) + rocsparse_index_base idx_base) { - axpyi_device(nnz, alpha, xVal, xInd, y, idxBase); + axpyi_device(nnz, alpha, x_val, x_ind, y, idx_base); } template __global__ void axpyi_kernel_device_scalar(rocsparse_int nnz, const T *alpha, - const T *xVal, - const rocsparse_int *xInd, + const T *x_val, + const rocsparse_int *x_ind, T *y, - rocsparse_index_base idxBase) + rocsparse_index_base idx_base) { - axpyi_device(nnz, *alpha, xVal, xInd, y, idxBase); + axpyi_device(nnz, *alpha, x_val, x_ind, y, idx_base); } /*! \brief SPARSE Level 1 API @@ -65,23 +65,23 @@ void axpyi_kernel_device_scalar(rocsparse_int nnz, @param[in] alpha scalar alpha. @param[in] - xVal pointer storing vector x non-zero values on the GPU. + x_val pointer storing vector x non-zero values on the GPU. @param[in] - xInd pointer storing vector x non-zero value indices on the GPU. + x_ind pointer storing vector x non-zero value indices on the GPU. @param[inout] y pointer storing y on the GPU. @param[in] - idxBase specifies the index base. + idx_base specifies the index base. 
********************************************************************/ template rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, rocsparse_int nnz, const T *alpha, - const T *xVal, - const rocsparse_int *xInd, + const T *x_val, + const rocsparse_int *x_ind, T *y, - rocsparse_index_base idxBase) + rocsparse_index_base idx_base) { // Check for valid handle if (handle == nullptr) @@ -93,27 +93,27 @@ rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, if (handle->pointer_mode == rocsparse_pointer_mode_host) { log_trace(handle, - replaceX("rocsparse_axpyi"), + replaceX("rocsparse_Xaxpyi"), nnz, *alpha, - (const void*&) xVal, - (const void*&) xInd, + (const void*&) x_val, + (const void*&) x_ind, (const void*&) y); } else { log_trace(handle, - replaceX("rocsparse_axpyi"), + replaceX("rocsparse_Xaxpyi"), nnz, (const void*&) alpha, - (const void*&) xVal, - (const void*&) xInd, + (const void*&) x_val, + (const void*&) x_ind, (const void*&) y); } // Check index base - if (idxBase != rocsparse_index_base_zero && - idxBase != rocsparse_index_base_one) + if (idx_base != rocsparse_index_base_zero && + idx_base != rocsparse_index_base_one) { return rocsparse_status_invalid_value; } @@ -129,11 +129,11 @@ rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, { return rocsparse_status_invalid_pointer; } - else if (xVal == nullptr) + else if (x_val == nullptr) { return rocsparse_status_invalid_pointer; } - else if (xInd == nullptr) + else if (x_ind == nullptr) { return rocsparse_status_invalid_pointer; } @@ -159,7 +159,7 @@ rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, { hipLaunchKernelGGL((axpyi_kernel_device_scalar), axpyi_blocks, axpyi_threads, 0, stream, - nnz, alpha, xVal, xInd, y, idxBase); + nnz, alpha, x_val, x_ind, y, idx_base); } else { @@ -170,7 +170,7 @@ rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, hipLaunchKernelGGL((axpyi_kernel_host_scalar), axpyi_blocks, axpyi_threads, 0, stream, - 
nnz, *alpha, xVal, xInd, y, idxBase); + nnz, *alpha, x_val, x_ind, y, idx_base); } #undef AXPYI_DIM return rocsparse_status_success; @@ -186,22 +186,22 @@ extern "C" rocsparse_status rocsparse_saxpyi(rocsparse_handle handle, rocsparse_int nnz, const float *alpha, - const float *xVal, - const rocsparse_int *xInd, + const float *x_val, + const rocsparse_int *x_ind, float *y, - rocsparse_index_base idxBase) + rocsparse_index_base idx_base) { - return rocsparse_axpyi_template(handle, nnz, alpha, xVal, xInd, y, idxBase); + return rocsparse_axpyi_template(handle, nnz, alpha, x_val, x_ind, y, idx_base); } extern "C" rocsparse_status rocsparse_daxpyi(rocsparse_handle handle, rocsparse_int nnz, const double *alpha, - const double *xVal, - const rocsparse_int *xInd, + const double *x_val, + const rocsparse_int *x_ind, double *y, - rocsparse_index_base idxBase) + rocsparse_index_base idx_base) { - return rocsparse_axpyi_template(handle, nnz, alpha, xVal, xInd, y, idxBase); + return rocsparse_axpyi_template(handle, nnz, alpha, x_val, x_ind, y, idx_base); } diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index 9d840e36..859f36ed 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -13,30 +13,30 @@ template ( - m, alpha, ptr, col, val, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } template __global__ void csrmvn_kernel_device_pointer(rocsparse_int m, const T *alpha, - const rocsparse_int *ptr, - const rocsparse_int *col, - const T *val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + const T *csr_val, const T *x, const T *beta, T *y) { csrmvn_general_device( - m, *alpha, ptr, col, val, x, *beta, y); + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); } /*! \brief SPARSE Level 2 API @@ -52,7 +52,7 @@ void csrmvn_kernel_device_pointer(rocsparse_int m, handle rocsparse_handle. handle to the rocsparse library context queue. 
@param[in] - transA operation type of A. + trans operation type of A. @param[in] m number of rows of A. @param[in] @@ -62,14 +62,14 @@ void csrmvn_kernel_device_pointer(rocsparse_int m, @param[in] alpha scalar alpha. @param[in] - descrA descriptor of A. + descr descriptor of A. @param[in] - csrValA array of nnz elements of A. + csr_val array of nnz elements of A. @param[in] - csrRowPtrA array of m+1 elements that point to the start + csr_row_ptr array of m+1 elements that point to the start of every row of A. @param[in] - csrColIndA array of nnz elements containing the column indices of A. + csr_col_ind array of nnz elements containing the column indices of A. @param[in] x array of n elements (op(A) = A) or m elements (op(A) = A^T or op(A) = A^H). @@ -82,15 +82,15 @@ void csrmvn_kernel_device_pointer(rocsparse_int m, ********************************************************************/ template rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, - rocsparse_operation transA, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, const T *alpha, - const rocsparse_mat_descr descrA, - const T *csrValA, - const rocsparse_int *csrRowPtrA, - const rocsparse_int *csrColIndA, + const rocsparse_mat_descr descr, + const T *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, const T *x, const T *beta, T *y) @@ -100,7 +100,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { return rocsparse_status_invalid_handle; } - else if (descrA == nullptr) + else if (descr == nullptr) { return rocsparse_status_invalid_pointer; } @@ -110,13 +110,13 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { log_trace(handle, replaceX("rocsparse_Xcsrmv"), - transA, + trans, m, n, nnz, *alpha, - (const void*&) descrA, - (const void*&) csrValA, - (const void*&) csrRowPtrA, - (const void*&) csrColIndA, + (const void*&) descr, + (const void*&) csr_val, + (const void*&) csr_row_ptr, + (const 
void*&) csr_col_ind, (const void*&) x, *beta, (const void*&) y); @@ -125,25 +125,25 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { log_trace(handle, replaceX("rocsparse_Xcsrmv"), - transA, + trans, m, n, nnz, (const void*&) alpha, - (const void*&) descrA, - (const void*&) csrValA, - (const void*&) csrRowPtrA, - (const void*&) csrColIndA, + (const void*&) descr, + (const void*&) csr_val, + (const void*&) csr_row_ptr, + (const void*&) csr_col_ind, (const void*&) x, (const void*&) beta, (const void*&) y); } // Check matrix type - if (descrA->base != rocsparse_index_base_zero) + if (descr->base != rocsparse_index_base_zero) { // TODO return rocsparse_status_not_implemented; } - if (descrA->type != rocsparse_matrix_type_general) + if (descr->type != rocsparse_matrix_type_general) { // TODO return rocsparse_status_not_implemented; @@ -165,15 +165,15 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } // Check pointer arguments - if (csrValA == nullptr) + if (csr_val == nullptr) { return rocsparse_status_invalid_pointer; } - else if (csrRowPtrA == nullptr) + else if (csr_row_ptr == nullptr) { return rocsparse_status_invalid_pointer; } - else if (csrColIndA == nullptr) + else if (csr_col_ind == nullptr) { return rocsparse_status_invalid_pointer; } @@ -204,7 +204,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, hipStream_t stream = handle->stream; // Run different csrmv kernels - if (transA == rocsparse_operation_none) + if (trans == rocsparse_operation_none) { #define CSRMVN_DIM 512 rocsparse_int nnz_per_row = nnz / m; @@ -220,31 +220,31 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } else if (nnz_per_row < 8) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), 
csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } else if (nnz_per_row < 16) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } else if (nnz_per_row < 32) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } else { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } } else if (handle->warp_size == 64) @@ -253,37 +253,37 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } else if (nnz_per_row < 8) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } else if (nnz_per_row < 16) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } else if (nnz_per_row < 32) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } else if (nnz_per_row < 64) { 
hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } else { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csrRowPtrA, csrColIndA, csrValA, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } } else @@ -304,31 +304,31 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); } else if (nnz_per_row < 8) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); } else if (nnz_per_row < 16) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); } else if (nnz_per_row < 32) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); } else { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); } } else if (handle->warp_size == 64) @@ -337,37 +337,37 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + m, *alpha, 
csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); } else if (nnz_per_row < 8) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); } else if (nnz_per_row < 16) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); } else if (nnz_per_row < 32) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); } else if (nnz_per_row < 64) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); } else { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csrRowPtrA, csrColIndA, csrValA, x, *beta, y); + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); } } else @@ -393,40 +393,40 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, extern "C" rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, - rocsparse_operation transA, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, const float *alpha, - const rocsparse_mat_descr descrA, - const float *csrValA, - const rocsparse_int *csrRowPtrA, - const rocsparse_int *csrColIndA, + const rocsparse_mat_descr descr, + const float *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, const float *x, const float *beta, float *y) { return rocsparse_csrmv_template( - handle, transA, m, n, nnz, alpha, descrA, - csrValA, csrRowPtrA, csrColIndA, x, beta, y); + handle, trans, m, 
n, nnz, alpha, descr, + csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } extern "C" rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, - rocsparse_operation transA, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, const double *alpha, - const rocsparse_mat_descr descrA, - const double *csrValA, - const rocsparse_int *csrRowPtrA, - const rocsparse_int *csrColIndA, + const rocsparse_mat_descr descr, + const double *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, const double *x, const double *beta, double *y) { return rocsparse_csrmv_template( - handle, transA, m, n, nnz, alpha, descrA, - csrValA, csrRowPtrA, csrColIndA, x, beta, y); + handle, trans, m, n, nnz, alpha, descr, + csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index 80d719db..6c0468d7 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -156,9 +156,9 @@ rocsparse_status rocsparse_get_version(rocsparse_handle handle, int *version) * It should be destroyed at the end using rocsparse_destroy_mat_descr(). 
*******************************************************************************/ extern "C" -rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descrA) +rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descr) { - if (descrA == nullptr) + if (descr == nullptr) { return rocsparse_status_invalid_pointer; } @@ -167,7 +167,7 @@ rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descrA) // Allocate try { - *descrA = new _rocsparse_mat_descr; + *descr = new _rocsparse_mat_descr; } catch(rocsparse_status status) { @@ -181,12 +181,12 @@ rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descrA) * \brief destroy matrix descriptor *******************************************************************************/ extern "C" -rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descrA) +rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descr) { // Destruct try { - delete descrA; + delete descr; } catch(rocsparse_status status) { @@ -199,11 +199,11 @@ rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descrA) * \brief Set the index base of the matrix descriptor. *******************************************************************************/ extern "C" -rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descrA, +rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descr, rocsparse_index_base base) { // Check if descriptor is valid - if (descrA == nullptr) + if (descr == nullptr) { return rocsparse_status_invalid_pointer; } @@ -212,7 +212,7 @@ rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descrA, { return rocsparse_status_invalid_value; } - descrA->base = base; + descr->base = base; return rocsparse_status_success; } @@ -220,25 +220,25 @@ rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descrA, * \brief Returns the index base of the matrix descriptor. 
*******************************************************************************/ extern "C" -rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr descrA) +rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr descr) { // If descriptor is invalid, default index base is returned - if (descrA == nullptr) + if (descr == nullptr) { return rocsparse_index_base_zero; } - return descrA->base; + return descr->base; } /******************************************************************************** * \brief Set the matrix type of the matrix descriptor. *******************************************************************************/ extern "C" -rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descrA, +rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descr, rocsparse_matrix_type type) { // Check if descriptor is valid - if (descrA == nullptr) + if (descr == nullptr) { return rocsparse_status_invalid_pointer; } @@ -248,7 +248,7 @@ rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descrA, { return rocsparse_status_invalid_value; } - descrA->type = type; + descr->type = type; return rocsparse_status_success; } @@ -256,14 +256,14 @@ rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descrA, * \brief Returns the matrix type of the matrix descriptor. 
*******************************************************************************/ extern "C" -rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descrA) +rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr) { // If descriptor is invalid, default matrix type is returned - if (descrA == nullptr) + if (descr == nullptr) { return rocsparse_matrix_type_general; } - return descrA->type; + return descr->type; } /******************************************************************************** From f1ea4125729dc7965fc6ec5f3ba1be567aabafa2 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 7 May 2018 17:06:06 +0200 Subject: [PATCH 029/304] consistency --- .../rocsparse_template_specialization.cpp | 72 +++++++++----- clients/include/rocsparse.hpp | 26 +++-- clients/include/rocsparse_test_unique_ptr.hpp | 16 ++++ clients/include/testing_csrmv.hpp | 94 +++++++++---------- clients/samples/example_csrmv.cpp | 32 +++---- cmake/Dependencies.cmake | 8 ++ 6 files changed, 155 insertions(+), 93 deletions(-) diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index abc9f8e4..36ea3646 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -12,64 +12,92 @@ template <> rocsparse_status rocsparse_axpyi(rocsparse_handle handle, rocsparse_int nnz, const float *alpha, - const float *xVal, - const rocsparse_int *xInd, + const float *x_val, + const rocsparse_int *x_ind, float *y, - rocsparse_index_base idxBase) + rocsparse_index_base idx_base) { - return rocsparse_saxpyi(handle, nnz, alpha, xVal, xInd, y, idxBase); + return rocsparse_saxpyi(handle, nnz, alpha, x_val, x_ind, y, idx_base); } template <> rocsparse_status rocsparse_axpyi(rocsparse_handle handle, rocsparse_int nnz, const double *alpha, - const double *xVal, - const rocsparse_int *xInd, + const double *x_val, + const rocsparse_int *x_ind, double *y, 
- rocsparse_index_base idxBase) + rocsparse_index_base idx_base) { - return rocsparse_daxpyi(handle, nnz, alpha, xVal, xInd, y, idxBase); + return rocsparse_daxpyi(handle, nnz, alpha, x_val, x_ind, y, idx_base); } template <> rocsparse_status rocsparse_csrmv(rocsparse_handle handle, - rocsparse_operation transA, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, const float *alpha, - const rocsparse_mat_descr descrA, - const float *csrValA, - const rocsparse_int *csrRowPtrA, - const rocsparse_int *csrColIndA, + const rocsparse_mat_descr descr, + const float *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, const float *x, const float *beta, float *y) { - return rocsparse_scsrmv(handle, transA, m, n, nnz, alpha, - descrA, csrValA, csrRowPtrA, csrColIndA, + return rocsparse_scsrmv(handle, trans, m, n, nnz, alpha, + descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } template <> rocsparse_status rocsparse_csrmv(rocsparse_handle handle, - rocsparse_operation transA, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, const double *alpha, - const rocsparse_mat_descr descrA, - const double *csrValA, - const rocsparse_int *csrRowPtrA, - const rocsparse_int *csrColIndA, + const rocsparse_mat_descr descr, + const double *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, const double *x, const double *beta, double *y) { - return rocsparse_dcsrmv(handle, transA, m, n, nnz, alpha, - descrA, csrValA, csrRowPtrA, csrColIndA, + return rocsparse_dcsrmv(handle, trans, m, n, nnz, alpha, + descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } +template <> +rocsparse_status rocsparse_hybmv(rocsparse_handle handle, + rocsparse_operation trans, + const float *alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, + const float *x, + const float *beta, + float *y) +{ + return rocsparse_shybmv(handle, trans, alpha, descr, + hyb, x, beta, y); 
+} + +template <> +rocsparse_status rocsparse_hybmv(rocsparse_handle handle, + rocsparse_operation trans, + const double *alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, + const double *x, + const double *beta, + double *y) +{ + return rocsparse_dhybmv(handle, trans, alpha, descr, + hyb, x, beta, y); +} + } // namespace rocsparse diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index 834740f8..3fde8695 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -14,22 +14,32 @@ template rocsparse_status rocsparse_axpyi(rocsparse_handle handle, rocsparse_int nnz, const T *alpha, - const T *xVal, - const rocsparse_int *xInd, + const T *x_val, + const rocsparse_int *x_ind, T *y, - rocsparse_index_base idxBase); + rocsparse_index_base idx_base); template rocsparse_status rocsparse_csrmv(rocsparse_handle handle, - rocsparse_operation transA, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, const T *alpha, - const rocsparse_mat_descr descrA, - const T *csrValA, - const rocsparse_int *csrRowPtrA, - const rocsparse_int *csrColIndA, + const rocsparse_mat_descr descr, + const T *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + const T *x, + const T *beta, + T *y); + +template +rocsparse_status rocsparse_hybmv(rocsparse_handle handle, + rocsparse_operation trans, + const T *alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, const T *x, const T *beta, T *y); diff --git a/clients/include/rocsparse_test_unique_ptr.hpp b/clients/include/rocsparse_test_unique_ptr.hpp index 2bce602a..9e06cb07 100644 --- a/clients/include/rocsparse_test_unique_ptr.hpp +++ b/clients/include/rocsparse_test_unique_ptr.hpp @@ -73,6 +73,22 @@ struct descr_struct } }; +struct hyb_struct +{ + rocsparse_hyb_mat hyb; + hyb_struct() + { + rocsparse_status status = rocsparse_create_hyb_mat(&hyb); + verify_rocsparse_status_success(status, "ERROR: 
hyb_struct constructor"); + } + + ~hyb_struct() + { + rocsparse_status status = rocsparse_destroy_hyb_mat(hyb); + verify_rocsparse_status_success(status, "ERROR: hyb_struct destructor"); + } +}; + } // namespace rocsparse_test using rocsparse_unique_ptr = std::unique_ptr; diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index 5918b9a3..06f1ce62 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -130,9 +130,9 @@ template rocsparse_status testing_csrmv(Arguments argus) { I safe_size = 100; - I nrow = argus.M; - I ncol = argus.N; - I nnz = argus.nnz == 32 ? nrow * 0.02 * ncol : argus.nnz; // 2% non zeros + I m = argus.M; + I n = argus.N; + I nnz = argus.nnz == 32 ? m * 0.02 * n : argus.nnz; // 2% non zeros T h_alpha = argus.alpha; T h_beta = argus.beta; op trans = argus.trans; @@ -145,7 +145,7 @@ rocsparse_status testing_csrmv(Arguments argus) rocsparse_mat_descr descr = test_descr->descr; // Argument sanity check before allocating invalid memory - if(nrow <= 0 || ncol <= 0 || nnz <= 0) + if(m <= 0 || n <= 0 || nnz <= 0) { auto dptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), device_free}; @@ -173,30 +173,30 @@ rocsparse_status testing_csrmv(Arguments argus) CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - status = rocsparse_csrmv(handle, trans, nrow, ncol, nnz, &h_alpha, + status = rocsparse_csrmv(handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy); - if (nrow < 0 || ncol < 0 || nnz < 0) + if (m < 0 || n < 0 || nnz < 0) { - verify_rocsparse_status_invalid_size(status, "Error: nrow < 0 || " - "ncol < 0 || nnz < 0"); + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || " + "n < 0 || nnz < 0"); } else { - verify_rocsparse_status_success(status, "nrow >= 0 && ncol >= 0 && nnz >= 0"); + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); } return rocsparse_status_success; } // 
Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice - std::vector hptr(nrow+1); + std::vector hptr(m+1); std::vector hcol(nnz); std::vector hval(nnz); - std::vector hx(ncol); - std::vector hy_1(nrow); - std::vector hy_2(nrow); - std::vector hy_gold(nrow); + std::vector hx(n); + std::vector hy_1(m); + std::vector hy_2(m); + std::vector hy_gold(m); // Initial Data on CPU srand(12345ULL); @@ -207,48 +207,48 @@ rocsparse_status testing_csrmv(Arguments argus) std::vector coo_val; if (read_mtx_matrix(argus.filename.c_str(), - nrow, ncol, nnz, + m, n, nnz, coo_row, coo_col, coo_val) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; } - coo_to_csr(nrow, ncol, nnz, + coo_to_csr(m, n, nnz, coo_row, coo_col, coo_val, hptr, hcol, hval); coo_row.clear(); coo_col.clear(); coo_val.clear(); - hx.resize(ncol); - hy_1.resize(nrow); - hy_2.resize(nrow); - hy_gold.resize(nrow); + hx.resize(n); + hy_1.resize(m); + hy_2.resize(m); + hy_gold.resize(m); } else { - rocsparse_init_csr(hptr, hcol, hval, nrow, ncol, nnz); + rocsparse_init_csr(hptr, hcol, hval, m, n, nnz); } - rocsparse_init(hx, 1, ncol); - rocsparse_init(hy_1, 1, nrow); + rocsparse_init(hx, 1, n); + rocsparse_init(hy_1, 1, m); // copy vector is easy in STL; hy_gold = hx: save a copy in hy_gold which will be output of CPU hy_2 = hy_1; hy_gold = hy_1; // allocate memory on device - auto dptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*(nrow+1)), + auto dptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*(m+1)), device_free}; auto dcol_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*nnz), device_free}; auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*nnz), device_free}; - auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*ncol), + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*n), device_free}; - auto dy_1_managed = 
rocsparse_unique_ptr{device_malloc(sizeof(T)*nrow), + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*m), device_free}; - auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*nrow), + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*m), device_free}; auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; @@ -272,11 +272,11 @@ rocsparse_status testing_csrmv(Arguments argus) } // copy data from CPU to device - CHECK_HIP_ERROR(hipMemcpy(dptr, hptr.data(), sizeof(I)*(nrow+1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dptr, hptr.data(), sizeof(I)*(m+1), hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dcol, hcol.data(), sizeof(I)*nnz, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dval, hval.data(), sizeof(T)*nnz, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T)*ncol, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T)*nrow, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T)*n, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T)*m, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); @@ -285,26 +285,26 @@ rocsparse_status testing_csrmv(Arguments argus) if(argus.unit_check) { - CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T)*nrow, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T)*m, hipMemcpyHostToDevice)); // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - CHECK_ROCSPARSE_ERROR(rocsparse_csrmv(handle, trans, nrow, ncol, nnz, &h_alpha, + CHECK_ROCSPARSE_ERROR(rocsparse_csrmv(handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1)); // ROCSPARSE pointer mode device 
CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); - CHECK_ROCSPARSE_ERROR(rocsparse_csrmv(handle, trans, nrow, ncol, nnz, d_alpha, + CHECK_ROCSPARSE_ERROR(rocsparse_csrmv(handle, trans, m, n, nnz, d_alpha, descr, dval, dptr, dcol, dx, d_beta, dy_2)); // copy output from device to CPU - CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T)*nrow, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T)*nrow, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T)*m, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T)*m, hipMemcpyDeviceToHost)); // CPU cpu_time_used = get_time_us(); - for (rocsparse_int i=0; i hAptr; std::vector hAcol; std::vector hAval; - int nrow = gen_2d_laplacian(ndim, hAptr, hAcol, hAval); - int ncol = nrow; - int nnz = hAptr[nrow]; + int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval); + int n = m; + int nnz = hAptr[m]; // Sample some random data srand(12345ULL); @@ -56,8 +56,8 @@ int main(int argc, char *argv[]) double halpha = (double) rand() / RAND_MAX; double hbeta = 0.0; - std::vector hx(nrow); - rocsparse_init(hx, 1, nrow); + std::vector hx(m); + rocsparse_init(hx, 1, m); // Matrix descriptor rocsparse_mat_descr descrA; @@ -70,23 +70,23 @@ int main(int argc, char *argv[]) double *dx = NULL; double *dy = NULL; - hipMalloc((void**) &dAptr, sizeof(int)*(nrow+1)); + hipMalloc((void**) &dAptr, sizeof(int)*(m+1)); hipMalloc((void**) &dAcol, sizeof(int)*nnz); hipMalloc((void**) &dAval, sizeof(double)*nnz); - hipMalloc((void**) &dx, sizeof(double)*nrow); - hipMalloc((void**) &dy, sizeof(double)*nrow); + hipMalloc((void**) &dx, sizeof(double)*m); + hipMalloc((void**) &dy, sizeof(double)*m); - hipMemcpy(dAptr, hAptr.data(), sizeof(int)*(nrow+1), hipMemcpyHostToDevice); + hipMemcpy(dAptr, hAptr.data(), sizeof(int)*(m+1), hipMemcpyHostToDevice); hipMemcpy(dAcol, hAcol.data(), sizeof(int)*nnz, hipMemcpyHostToDevice); 
hipMemcpy(dAval, hAval.data(), sizeof(double)*nnz, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(double)*nrow, hipMemcpyHostToDevice); + hipMemcpy(dx, hx.data(), sizeof(double)*m, hipMemcpyHostToDevice); // Warm up for (int i=0; i<10; ++i) { // Call rocsparse csrmv rocsparse_dcsrmv(handle, rocsparse_operation_none, - nrow, ncol, nnz, + m, n, nnz, &halpha, descrA, dAval, dAptr, dAcol, dx, &hbeta, dy); @@ -105,7 +105,7 @@ int main(int argc, char *argv[]) { // Call rocsparse csrmv rocsparse_dcsrmv(handle, rocsparse_operation_none, - nrow, ncol, nnz, + m, n, nnz, &halpha, descrA, dAval, dAptr, dAcol, dx, &hbeta, dy); @@ -116,12 +116,12 @@ int main(int argc, char *argv[]) } time = (get_time_us() - time) / (trials * batch_size * 1e3); - double bandwidth = static_cast(sizeof(double)*(2*nrow+nnz) - +sizeof(rocsparse_int)*(nrow+1+nnz))/time/1e6; + double bandwidth = static_cast(sizeof(double)*(2*m+nnz) + +sizeof(rocsparse_int)*(m+1+nnz))/time/1e6; double gflops = static_cast(2*nnz)/time/1e6; - printf("nrow\t\tncol\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tusec\n"); + printf("m\t\tn\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tusec\n"); printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", - nrow, ncol, nnz, halpha, hbeta, gflops, bandwidth, time); + m, n, nnz, halpha, hbeta, gflops, bandwidth, time); diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index d93112e2..e80faf72 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -105,10 +105,18 @@ if(BUILD_BENCHMARK) endif() # rocPRIM package +#set(ROCPRIM_ROOT ${CMAKE_CURRENT_BINARY_DIR}/rocPRIM CACHE PATH "") #message(STATUS "Downloading rocPRIM.") #download_project(PROJ rocPRIM # GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git # GIT_TAG master +# INSTALL_DIR ${ROCPRIM_ROOT} +# CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= +# LOG_DOWNLOAD TRUE +# LOG_CONFIGURE TRUE +# LOG_INSTALL TRUE +# BUILD_PROJECT TRUE +# UPDATE_DISCONNECT 
TRUE #) # ROCm package From 7f10e1eba7965336e74488719fdef7d7bbffe2e1 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 8 May 2018 07:49:44 +0200 Subject: [PATCH 030/304] ellmv example --- clients/samples/CMakeLists.txt | 1 + clients/samples/example_ellmv.cpp | 143 ++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 clients/samples/example_ellmv.cpp diff --git a/clients/samples/CMakeLists.txt b/clients/samples/CMakeLists.txt index 85e56f8f..b2238fee 100644 --- a/clients/samples/CMakeLists.txt +++ b/clients/samples/CMakeLists.txt @@ -36,3 +36,4 @@ endfunction() # Examples add_rocsparse_example(example_handle.cpp) add_rocsparse_example(example_csrmv.cpp) +add_rocsparse_example(example_ellmv.cpp) diff --git a/clients/samples/example_ellmv.cpp b/clients/samples/example_ellmv.cpp new file mode 100644 index 00000000..e6c344ea --- /dev/null +++ b/clients/samples/example_ellmv.cpp @@ -0,0 +1,143 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "utility.hpp" + +#include +#include +#include +#include + +int main(int argc, char *argv[]) +{ + // Parse command line + if (argc < 2) + { + fprintf(stderr, "%s [ ]\n", argv[0]); + return -1; + } + + int ndim = atoi(argv[1]); + int trials = 200; + int batch_size = 1; + + if (argc > 2) + { + trials = atoi(argv[2]); + } + if (argc > 3) + { + batch_size = atoi(argv[3]); + } + + // rocSPARSE handle + rocsparse_handle handle; + rocsparse_create_handle(&handle); + + hipDeviceProp_t devProp; + int device_id = 0; + + hipGetDevice(&device_id); + hipGetDeviceProperties(&devProp, device_id); + printf("Device: %s\n", devProp.name); + + // Generate problem in CSR format + std::vector hAptr; + std::vector hAcol; + std::vector hAval; + int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval); + int n = m; + int nnz = hAptr[m]; + + // Sample some random data + srand(12345ULL); + + double halpha = (double) rand() / RAND_MAX; + double hbeta = 0.0; + + std::vector hx(m); + rocsparse_init(hx, 1, m); + + // Matrix descriptor + rocsparse_mat_descr descrA; + rocsparse_create_mat_descr(&descrA); + + // Offload data to device + int *dAptr = NULL; + int *dAcol = NULL; + double *dAval = NULL; + double *dx = NULL; + double *dy = NULL; + + hipMalloc((void**) &dAptr, sizeof(int)*(m+1)); + hipMalloc((void**) &dAcol, sizeof(int)*nnz); + hipMalloc((void**) &dAval, sizeof(double)*nnz); + hipMalloc((void**) &dx, sizeof(double)*m); + hipMalloc((void**) &dy, sizeof(double)*m); + + hipMemcpy(dAptr, hAptr.data(), sizeof(int)*(m+1), hipMemcpyHostToDevice); + hipMemcpy(dAcol, hAcol.data(), sizeof(int)*nnz, hipMemcpyHostToDevice); + hipMemcpy(dAval, hAval.data(), sizeof(double)*nnz, hipMemcpyHostToDevice); + hipMemcpy(dx, hx.data(), sizeof(double)*m, hipMemcpyHostToDevice); + + // Convert CSR matrix to HYB format, using partition type to be + // rocsparse_hyb_partition_max. 
This will result in ELL matrix format, + // using maximum ELL width length. + rocsparse_hyb_mat hybA; + rocsparse_create_hyb_mat(&hybA); + + rocsparse_dcsr2hyb(handle, m, n, + descrA, dAval, dAptr, dAcol, + hybA, 0, rocsparse_hyb_partition_max); + + // Clean up CSR structures + hipFree(dAptr); + hipFree(dAcol); + hipFree(dAval); + + // Warm up + for (int i=0; i<10; ++i) + { + // Call rocsparse hybmv + rocsparse_dhybmv(handle, rocsparse_operation_none, + &halpha, descrA, hybA, + dx, &hbeta, dy); + } + + // Device synchronization + hipDeviceSynchronize(); + + // Start time measurement + double time = get_time_us(); + + // HYB matrix vector multiplication + for (int i=0; i(sizeof(double)*(2*m+nnz) + +sizeof(rocsparse_int)*(nnz))/time/1e6; + double gflops = static_cast(2*nnz)/time/1e6; + printf("m\t\tn\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tusec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", + m, n, nnz, halpha, hbeta, gflops, bandwidth, time); + + // Clean up + rocsparse_destroy_hyb_mat(hybA); + rocsparse_destroy_mat_descr(descrA); + rocsparse_destroy_handle(handle); + + return 0; +} From bd157b79ca4a8b7ababe7a6074aa77df52047ef3 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 8 May 2018 07:50:31 +0200 Subject: [PATCH 031/304] comments --- library/include/rocsparse-functions.h | 74 +++++++++++++++++---------- 1 file changed, 48 insertions(+), 26 deletions(-) diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 034232c5..09aa7555 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -179,32 +179,6 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, rocsparse_double_complex *y); */ -// TODO -ROCSPARSE_EXPORT -rocsparse_status rocsparse_scsr2hyb(rocsparse_handle handle, - rocsparse_int m, - rocsparse_int n, - const rocsparse_mat_descr descr, - const float *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - 
rocsparse_hyb_mat hyb, - rocsparse_int user_ell_width, - rocsparse_hyb_partition partition_type); - -// TODO -ROCSPARSE_EXPORT -rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, - rocsparse_int m, - rocsparse_int n, - const rocsparse_mat_descr descr, - const double *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - rocsparse_hyb_mat hyb, - rocsparse_int user_ell_width, - rocsparse_hyb_partition partition_type); - // TODO ROCSPARSE_EXPORT rocsparse_status rocsparse_shybmv(rocsparse_handle handle, @@ -233,6 +207,54 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, * =========================================================================== */ + + + + + + +/* + * =========================================================================== + * Sparse format conversions + * =========================================================================== + */ + +// TODO +ROCSPARSE_EXPORT +rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, + const rocsparse_int *coo_row_ind, + rocsparse_int nnz, + rocsparse_int m, + rocsparse_int *csr_row_ptr, + rocsparse_index_base idx_base); + +// TODO +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsr2hyb(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const float *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type); + +// TODO +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const double *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type); + + #ifdef __cplusplus } #endif From d83730b1a607c03ddc9fd09ab00a721bf7603462 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 
8 May 2018 12:34:57 +0200 Subject: [PATCH 032/304] device ell conversion kernel and error checks added --- library/src/conversion/csr2hyb_device.h | 86 +++++++++++++++++++- library/src/conversion/rocsparse_csr2hyb.cpp | 74 +++++++---------- 2 files changed, 115 insertions(+), 45 deletions(-) diff --git a/library/src/conversion/csr2hyb_device.h b/library/src/conversion/csr2hyb_device.h index fb6ff48e..26c2d3f7 100644 --- a/library/src/conversion/csr2hyb_device.h +++ b/library/src/conversion/csr2hyb_device.h @@ -10,9 +10,91 @@ #include -template +template __device__ -void csr2ell_device(rocsparse_int m, +void ell_width_reduce(rocsparse_int tid, rocsparse_int *data) +{ + __syncthreads(); + + for (int i=NB>>1; i>0; i>>=1) + { + if (tid < i) + { + data[tid] = max(data[tid], data[tid+i]); + } + + __syncthreads(); + } +} + +template +__global__ +void ell_width_kernel_part1(rocsparse_int m, + const rocsparse_int *csr_row_ptr, + rocsparse_int *workspace) +{ + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + __shared__ rocsparse_int sdata[NB]; + + if (gid < m) + { + sdata[tid] = csr_row_ptr[gid+1] - csr_row_ptr[gid]; + } + else + { + sdata[tid] = 0; + } + + ell_width_reduce(tid, sdata); + + if (tid == 0) + { + workspace[hipBlockIdx_x] = sdata[0]; + } +} + +template +__global__ +void ell_width_kernel_part2(rocsparse_int m, rocsparse_int *workspace) +{ + rocsparse_int tid = hipThreadIdx_x; + + __shared__ rocsparse_int sdata[NB]; + sdata[tid] = 0; + + for (rocsparse_int i=tid; i sdata[tid]) ? workspace[i] : sdata[tid]; + } + + __syncthreads(); + + if (m < 32) + { + if (tid == 0) + { + for (rocsparse_int i=1; i sdata[0]) ? 
sdata[i] : sdata[0]; + } + } + } + else + { + ell_width_reduce(tid, sdata); + } + + if (tid == 0) + { + workspace[0] = sdata[0]; + } +} + +template +__global__ +void csr2ell_kernel(rocsparse_int m, const T *csr_val, const rocsparse_int *csr_row_ptr, const rocsparse_int *csr_col_ind, diff --git a/library/src/conversion/rocsparse_csr2hyb.cpp b/library/src/conversion/rocsparse_csr2hyb.cpp index 08ed46ee..d97a98ab 100644 --- a/library/src/conversion/rocsparse_csr2hyb.cpp +++ b/library/src/conversion/rocsparse_csr2hyb.cpp @@ -3,26 +3,13 @@ * ************************************************************************ */ #include "rocsparse.h" +#include "definitions.h" #include "handle.h" #include "utility.h" #include "csr2hyb_device.h" #include -template -__global__ -void csr2ell_kernel(rocsparse_int m, - const T *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - rocsparse_int ell_width, - rocsparse_int *ell_col_ind, - T *ell_val) -{ - csr2ell_device(m, csr_val, csr_row_ptr, csr_col_ind, - ell_width, ell_col_ind, ell_val); -} - template rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, rocsparse_int m, @@ -121,45 +108,51 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, if (hyb->ell_col_ind) { - hipFree(hyb->ell_col_ind); + RETURN_IF_HIP_ERROR(hipFree(hyb->ell_col_ind)); } if (hyb->ell_val) { - hipFree(hyb->ell_val); + RETURN_IF_HIP_ERROR(hipFree(hyb->ell_val)); } if (hyb->coo_row_ind) { - hipFree(hyb->coo_row_ind); + RETURN_IF_HIP_ERROR(hipFree(hyb->coo_row_ind)); } if (hyb->coo_col_ind) { - hipFree(hyb->coo_col_ind); + RETURN_IF_HIP_ERROR(hipFree(hyb->coo_col_ind)); } if (hyb->coo_val) { - hipFree(hyb->coo_val); + RETURN_IF_HIP_ERROR(hipFree(hyb->coo_val)); } -#define CSR2ELL_DIM 256 - dim3 csr2ell_blocks((m-1)/CSR2ELL_DIM+1); - dim3 csr2ell_threads(CSR2ELL_DIM); - +#define CSR2ELL_DIM 512 //TODO we take max partition if (partition_type == rocsparse_hyb_partition_max) { // ELL part only, compute 
maximum non-zeros per row + rocsparse_int blocks = handle->warp_size; + + // Allocate workspace + rocsparse_int *workspace = NULL; + RETURN_IF_HIP_ERROR(hipMalloc((void**) &workspace, + sizeof(rocsparse_int)*blocks)); - //TODO reduction with rocPRIM to compute maxrow - rocsparse_int *hbuf = (rocsparse_int*) malloc(sizeof(rocsparse_int)*(m+1)); - hipMemcpy(hbuf, csr_row_ptr, sizeof(rocsparse_int)*(m+1), hipMemcpyDeviceToHost); + hipLaunchKernelGGL((ell_width_kernel_part1), + dim3(blocks), dim3(CSR2ELL_DIM), 0, stream, + m, csr_row_ptr, workspace); - for (rocsparse_int i=0; iell_width = rownnz > hyb->ell_width ? rownnz : hyb->ell_width; - } - free(hbuf); - // END TODO + hipLaunchKernelGGL((ell_width_kernel_part2), + dim3(1), dim3(CSR2ELL_DIM), 0, stream, + blocks, workspace); + + // Copy ell width back to host + RETURN_IF_HIP_ERROR(hipMemcpy(&hyb->ell_width, + workspace, + sizeof(rocsparse_int), + hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR(hipFree(workspace)); } else { @@ -171,24 +164,19 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, hyb->ell_nnz = hyb->ell_width * m; // Allocate ELL part - hipMalloc((void**) &hyb->ell_col_ind, sizeof(rocsparse_int)*hyb->ell_nnz); - hipMalloc(&hyb->ell_val, sizeof(T)*hyb->ell_nnz); - + RETURN_IF_HIP_ERROR(hipMalloc((void**) &hyb->ell_col_ind, + sizeof(rocsparse_int)*hyb->ell_nnz)); + RETURN_IF_HIP_ERROR(hipMalloc(&hyb->ell_val, sizeof(T)*hyb->ell_nnz)); + dim3 csr2ell_blocks((m-1)/CSR2ELL_DIM+1); + dim3 csr2ell_threads(CSR2ELL_DIM); hipLaunchKernelGGL((csr2ell_kernel), csr2ell_blocks, csr2ell_threads, 0, stream, m, csr_val, csr_row_ptr, csr_col_ind, hyb->ell_width, hyb->ell_col_ind, (T*) hyb->ell_val); - #undef CSR2ELL_DIM - - - - - - return rocsparse_status_success; } From a58231148de00950f9bc4cd3861230b332738148 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 8 May 2018 14:49:04 +0200 Subject: [PATCH 033/304] csr2coo and coo2csr --- library/include/rocsparse-functions.h | 9 ++ 
library/src/CMakeLists.txt | 2 + library/src/conversion/coo2csr_device.h | 59 +++++++ library/src/conversion/csr2coo_device.h | 31 ++++ library/src/conversion/rocsparse_coo2csr.cpp | 89 +++++++++++ library/src/conversion/rocsparse_csr2coo.cpp | 160 +++++++++++++++++++ 6 files changed, 350 insertions(+) create mode 100644 library/src/conversion/coo2csr_device.h create mode 100644 library/src/conversion/csr2coo_device.h create mode 100644 library/src/conversion/rocsparse_coo2csr.cpp create mode 100644 library/src/conversion/rocsparse_csr2coo.cpp diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 09aa7555..fb74373e 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -219,6 +219,15 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, * =========================================================================== */ +// TODO +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, + const rocsparse_int *csr_row_ptr, + rocsparse_int nnz, + rocsparse_int m, + rocsparse_int *coo_row_ind, + rocsparse_index_base idx_base); + // TODO ROCSPARSE_EXPORT rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index d25ff4ef..f780787a 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -11,4 +11,6 @@ set(rocsparse_source src/level2/rocsparse_csrmv.cpp src/level2/rocsparse_hybmv.cpp src/conversion/rocsparse_csr2hyb.cpp + src/conversion/rocsparse_csr2coo.cpp + src/conversion/rocsparse_coo2csr.cpp ) diff --git a/library/src/conversion/coo2csr_device.h b/library/src/conversion/coo2csr_device.h new file mode 100644 index 00000000..8a7bf2de --- /dev/null +++ b/library/src/conversion/coo2csr_device.h @@ -0,0 +1,59 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef COO2CSR_DEVICE_H +#define COO2CSR_DEVICE_H + +#include + +__device__ +rocsparse_int lower_bound(const rocsparse_int *arr, + rocsparse_int key, + rocsparse_int low, + rocsparse_int high) +{ + if (low > high) + { + return low; + } + + rocsparse_int mid = low + ((high - low) >> 1); + + if (arr[mid] >= key) + { + high = mid - 1; + } + else + { + low = mid + 1; + } + return lower_bound(arr, key, low, high); +} + +__global__ +void coo2csr_kernel(rocsparse_int m, + rocsparse_int nnz, + const rocsparse_int *coo_row_ind, + rocsparse_int *csr_row_ptr) +{ + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if (gid >= m) + { + return; + } + + if (gid == 0) + { + csr_row_ptr[0] = 0; + csr_row_ptr[m] = nnz; + return; + } + + // Binary search + csr_row_ptr[gid] = lower_bound(coo_row_ind, gid, 0, nnz-1); +} + +#endif // COO2CSR_DEVICE_H diff --git a/library/src/conversion/csr2coo_device.h b/library/src/conversion/csr2coo_device.h new file mode 100644 index 00000000..b82320cf --- /dev/null +++ b/library/src/conversion/csr2coo_device.h @@ -0,0 +1,31 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef CSR2COO_DEVICE_H +#define CSR2COO_DEVICE_H + +#include + +template +__global__ +void csr2coo_kernel(rocsparse_int m, + const rocsparse_int *csr_row_ptr, + rocsparse_int *coo_row_ind) +{ + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + rocsparse_int lid = hipThreadIdx_x % THREADS; + rocsparse_int vid = gid / THREADS; + rocsparse_int nvec = hipGridDim_x * hipBlockDim_x / THREADS; + + for(rocsparse_int ai=vid; ai + +extern "C" +rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, + const rocsparse_int *coo_row_ind, + rocsparse_int nnz, + rocsparse_int m, + rocsparse_int *csr_row_ptr, + rocsparse_index_base idx_base) +{ + // Check for valid handle + if (handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging TODO bench logging + log_trace(handle, + "rocsparse_coo2csr", + (const void*&) coo_row_ind, + nnz, + m, + (const void*&) csr_row_ptr, + idx_base); + + // Check matrix parameters + if (idx_base != rocsparse_index_base_zero) + { + //TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if (nnz < 0) + { + return rocsparse_status_invalid_size; + } + else if (m < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if (coo_row_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if (nnz == 0 || m == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + // TODO + +#define COO2CSR_DIM 512 + dim3 coo2csr_blocks((m-1)/COO2CSR_DIM+1); + dim3 coo2csr_threads(COO2CSR_DIM); + + hipLaunchKernelGGL((coo2csr_kernel), + coo2csr_blocks, coo2csr_threads, 0, stream, + m, nnz, coo_row_ind, csr_row_ptr); + + + + +#undef COO2CSR_DIM + + + + return rocsparse_status_success; +} diff --git 
a/library/src/conversion/rocsparse_csr2coo.cpp b/library/src/conversion/rocsparse_csr2coo.cpp new file mode 100644 index 00000000..d5ecc272 --- /dev/null +++ b/library/src/conversion/rocsparse_csr2coo.cpp @@ -0,0 +1,160 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "csr2coo_device.h" + +#include + +extern "C" +rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, + const rocsparse_int *csr_row_ptr, + rocsparse_int nnz, + rocsparse_int m, + rocsparse_int *coo_row_ind, + rocsparse_index_base idx_base) +{ + // Check for valid handle + if (handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging TODO bench logging + log_trace(handle, + "rocsparse_csr2coo", + (const void*&) csr_row_ptr, + nnz, + m, + (const void*&) coo_row_ind, + idx_base); + + // Check matrix parameters + if (idx_base != rocsparse_index_base_zero) + { + //TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if (nnz < 0) + { + return rocsparse_status_invalid_size; + } + else if (m < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if (csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if (coo_row_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if (nnz == 0 || m == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + // TODO + + + +#define CSR2COO_DIM 512 + rocsparse_int nnz_per_row = nnz / m; + + dim3 csr2coo_blocks((m-1)/CSR2COO_DIM+1); + dim3 csr2coo_threads(CSR2COO_DIM); + + if (handle->warp_size == 32) + { + if (nnz_per_row < 4) + { + hipLaunchKernelGGL((csr2coo_kernel<2>), + csr2coo_blocks, csr2coo_threads, 0, stream, + m, csr_row_ptr, 
coo_row_ind); + } + else if (nnz_per_row < 8) + { + hipLaunchKernelGGL((csr2coo_kernel<4>), + csr2coo_blocks, csr2coo_threads, 0, stream, + m, csr_row_ptr, coo_row_ind); + } + else if (nnz_per_row < 16) + { + hipLaunchKernelGGL((csr2coo_kernel<8>), + csr2coo_blocks, csr2coo_threads, 0, stream, + m, csr_row_ptr, coo_row_ind); + } + else if (nnz_per_row < 32) + { + hipLaunchKernelGGL((csr2coo_kernel<16>), + csr2coo_blocks, csr2coo_threads, 0, stream, + m, csr_row_ptr, coo_row_ind); + } + else + { + hipLaunchKernelGGL((csr2coo_kernel<32>), + csr2coo_blocks, csr2coo_threads, 0, stream, + m, csr_row_ptr, coo_row_ind); + } + } + else if (handle->warp_size == 64) + { + if (nnz_per_row < 4) + { + hipLaunchKernelGGL((csr2coo_kernel<2>), + csr2coo_blocks, csr2coo_threads, 0, stream, + m, csr_row_ptr, coo_row_ind); + } + else if (nnz_per_row < 8) + { + hipLaunchKernelGGL((csr2coo_kernel<4>), + csr2coo_blocks, csr2coo_threads, 0, stream, + m, csr_row_ptr, coo_row_ind); + } + else if (nnz_per_row < 16) + { + hipLaunchKernelGGL((csr2coo_kernel<8>), + csr2coo_blocks, csr2coo_threads, 0, stream, + m, csr_row_ptr, coo_row_ind); + } + else if (nnz_per_row < 32) + { + hipLaunchKernelGGL((csr2coo_kernel<16>), + csr2coo_blocks, csr2coo_threads, 0, stream, + m, csr_row_ptr, coo_row_ind); + } + else if (nnz_per_row < 64) + { + hipLaunchKernelGGL((csr2coo_kernel<32>), + csr2coo_blocks, csr2coo_threads, 0, stream, + m, csr_row_ptr, coo_row_ind); + } + else + { + hipLaunchKernelGGL((csr2coo_kernel<64>), + csr2coo_blocks, csr2coo_threads, 0, stream, + m, csr_row_ptr, coo_row_ind); + } + } + else + { + return rocsparse_status_arch_mismatch; + } +#undef CSR2COO_DIM + + return rocsparse_status_success; +} From ac6e926edff26ed80a682260cabbede0277d0011 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 8 May 2018 16:41:34 +0200 Subject: [PATCH 034/304] added sparse matrix generator (COO) --- clients/include/utility.hpp | 80 +++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) 
diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index 6726ddcc..71697410 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -213,6 +213,86 @@ rocsparse_int gen_2d_laplacian(rocsparse_int ndim, return n; } +/* ============================================================================================ */ +/*! \brief Generate a random sparse matrix in COO format */ +template +rocsparse_int gen_matrix_coo(rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + std::vector &row_ind, + std::vector &col_ind, + std::vector &val) +{ + if (row_ind.size() != nnz) + { + row_ind.resize(nnz); + } + if (col_ind.size() != nnz) + { + col_ind.resize(nnz); + } + if (val.size() != nnz) + { + val.resize(nnz); + } + + // Uniform distributed row indices + for (rocsparse_int i=0; i check(nnz, false); + + rocsparse_int i=0; + while (i < nnz) + { + rocsparse_int begin = i; + while (row_ind[i] == row_ind[begin]) + ++i; + + // Sample i disjunct column indices + rocsparse_int idx = begin; + while (idx < i) + { + // Normal distribution around the diagonal + rocsparse_int rng = row_ind[begin] + (i - begin) + * sqrt(-2.0 * log((double) rand() / RAND_MAX)) + * cos(2.0 * M_PI * (double) rand() / RAND_MAX); + + // Repeat if running out of bounds + if (rng < 0 || rng > n-1) + continue; + + // Check for disjunct column index in current row + if (!check[rng]) + { + check[rng] = true; + col_ind[idx] = rng; + ++idx; + } + } + + // Reset disjunct check array + for (rocsparse_int j=begin; j From e99d0ab3ced3e4734c8c0f984134427b742002aa Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 9 May 2018 12:04:07 +0200 Subject: [PATCH 035/304] coo2csr: index_base_one support --- library/src/conversion/coo2csr_device.h | 9 +++++---- library/src/conversion/rocsparse_coo2csr.cpp | 18 +----------------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/library/src/conversion/coo2csr_device.h b/library/src/conversion/coo2csr_device.h index 
8a7bf2de..761c0873 100644 --- a/library/src/conversion/coo2csr_device.h +++ b/library/src/conversion/coo2csr_device.h @@ -36,7 +36,8 @@ __global__ void coo2csr_kernel(rocsparse_int m, rocsparse_int nnz, const rocsparse_int *coo_row_ind, - rocsparse_int *csr_row_ptr) + rocsparse_int *csr_row_ptr, + rocsparse_index_base idx_base) { rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -47,13 +48,13 @@ void coo2csr_kernel(rocsparse_int m, if (gid == 0) { - csr_row_ptr[0] = 0; - csr_row_ptr[m] = nnz; + csr_row_ptr[0] = idx_base; + csr_row_ptr[m] = nnz + idx_base; return; } // Binary search - csr_row_ptr[gid] = lower_bound(coo_row_ind, gid, 0, nnz-1); + csr_row_ptr[gid] = lower_bound(coo_row_ind, gid+idx_base, 0, nnz-1) + idx_base; } #endif // COO2CSR_DEVICE_H diff --git a/library/src/conversion/rocsparse_coo2csr.cpp b/library/src/conversion/rocsparse_coo2csr.cpp index 7be74758..0a1f6cc1 100644 --- a/library/src/conversion/rocsparse_coo2csr.cpp +++ b/library/src/conversion/rocsparse_coo2csr.cpp @@ -32,13 +32,6 @@ rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, (const void*&) csr_row_ptr, idx_base); - // Check matrix parameters - if (idx_base != rocsparse_index_base_zero) - { - //TODO - return rocsparse_status_not_implemented; - } - // Check sizes if (nnz < 0) { @@ -68,22 +61,13 @@ rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, // Stream hipStream_t stream = handle->stream; - // TODO - #define COO2CSR_DIM 512 dim3 coo2csr_blocks((m-1)/COO2CSR_DIM+1); dim3 coo2csr_threads(COO2CSR_DIM); hipLaunchKernelGGL((coo2csr_kernel), coo2csr_blocks, coo2csr_threads, 0, stream, - m, nnz, coo_row_ind, csr_row_ptr); - - - - + m, nnz, coo_row_ind, csr_row_ptr, idx_base); #undef COO2CSR_DIM - - - return rocsparse_status_success; } From 6d8c31383cbcea1c6f2e299ae0779f847a83d65f Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 9 May 2018 12:05:09 +0200 Subject: [PATCH 036/304] gen_matrix_coo: index_base_one support --- clients/include/utility.hpp 
| 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index 71697410..d1bae037 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -216,12 +216,13 @@ rocsparse_int gen_2d_laplacian(rocsparse_int ndim, /* ============================================================================================ */ /*! \brief Generate a random sparse matrix in COO format */ template -rocsparse_int gen_matrix_coo(rocsparse_int m, - rocsparse_int n, - rocsparse_int nnz, - std::vector &row_ind, - std::vector &col_ind, - std::vector &val) +void gen_matrix_coo(rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + std::vector &row_ind, + std::vector &col_ind, + std::vector &val, + rocsparse_index_base idx_base) { if (row_ind.size() != nnz) { @@ -260,10 +261,15 @@ rocsparse_int gen_matrix_coo(rocsparse_int m, while (idx < i) { // Normal distribution around the diagonal - rocsparse_int rng = row_ind[begin] + (i - begin) + rocsparse_int rng = (i - begin) * sqrt(-2.0 * log((double) rand() / RAND_MAX)) * cos(2.0 * M_PI * (double) rand() / RAND_MAX); + if (m <= n) + { + rng += row_ind[begin]; + } + // Repeat if running out of bounds if (rng < 0 || rng > n-1) continue; @@ -285,6 +291,16 @@ rocsparse_int gen_matrix_coo(rocsparse_int m, std::sort(&col_ind[begin], &col_ind[i]); } + // Correct index base accordingly + if (idx_base == rocsparse_index_base_one) + { + for (rocsparse_int i=0; i Date: Wed, 9 May 2018 12:06:09 +0200 Subject: [PATCH 037/304] coo2csr test/benchmark added --- clients/benchmarks/client.cpp | 7 +- clients/include/testing_coo2csr.hpp | 215 ++++++++++++++++++++++++++++ clients/tests/CMakeLists.txt | 31 ++-- clients/tests/test_coo2csr.cpp | 53 +++++++ 4 files changed, 295 insertions(+), 11 deletions(-) create mode 100644 clients/include/testing_coo2csr.hpp create mode 100644 clients/tests/test_coo2csr.cpp diff --git 
a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 30482a40..fd0f642e 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -6,6 +6,7 @@ #include "rocsparse.hpp" #include "testing_csrmv.hpp" #include "testing_axpyi.hpp" +#include "testing_coo2csr.hpp" #include #include @@ -54,7 +55,7 @@ int main(int argc, char *argv[]) ("function,f", po::value(&function)->default_value("axpyi"), - "SPARSE function to test. Options: axpyi, csrmv") + "SPARSE function to test. Options: axpyi, csrmv, coo2csr") ("precision,r", po::value(&precision)->default_value('s'), "Options: s,d") @@ -122,6 +123,10 @@ int main(int argc, char *argv[]) else if (precision == 'd') testing_csrmv(argus); } + else if (function == "coo2csr") + { + testing_coo2csr(argus); + } else { fprintf(stderr, "Invalid value for --function\n"); diff --git a/clients/include/testing_coo2csr.hpp b/clients/include/testing_coo2csr.hpp new file mode 100644 index 00000000..311937b8 --- /dev/null +++ b/clients/include/testing_coo2csr.hpp @@ -0,0 +1,215 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef TESTING_COO2CSR_HPP +#define TESTING_COO2CSR_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +void testing_coo2csr_bad_arg(void) +{ + rocsparse_int m = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + auto coo_row_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), + device_free}; + auto csr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), + device_free}; + + rocsparse_int *coo_row_ind = (rocsparse_int*) coo_row_ind_managed.get(); + rocsparse_int *csr_row_ptr = (rocsparse_int*) csr_row_ptr_managed.get(); + + if (!coo_row_ind || + !csr_row_ptr) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // Testing for (coo_row_ind == nullptr) + { + rocsparse_int *coo_row_ind_null = nullptr; + status = rocsparse_coo2csr(handle, coo_row_ind_null, nnz, m, + csr_row_ptr, rocsparse_index_base_zero); + verify_rocsparse_status_invalid_pointer(status, "Error: coo_row_ind is nullptr"); + } + // Testing for (csr_row_ptr == nullptr) + { + rocsparse_int *csr_row_ptr_null = nullptr; + status = rocsparse_coo2csr(handle, coo_row_ind, nnz, m, + csr_row_ptr_null, rocsparse_index_base_zero); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); + } + // Testing for (handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + status = rocsparse_coo2csr(handle_null, coo_row_ind, nnz, m, + csr_row_ptr, rocsparse_index_base_zero); + verify_rocsparse_status_invalid_handle(status); + } +} + +rocsparse_status testing_coo2csr(Arguments argus) +{ + rocsparse_int m 
= argus.M; + rocsparse_int n = argus.N; + rocsparse_int safe_size = 100; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_status status; + + double scale = 0.02; + if (m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + // Argument sanity check before allocating invalid memory + if (m <= 0 || n <= 0 || nnz <= 0) + { + auto coo_row_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), + device_free}; + auto csr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), + device_free}; + + rocsparse_int *coo_row_ind = (rocsparse_int*) coo_row_ind_managed.get(); + rocsparse_int *csr_row_ptr = (rocsparse_int*) csr_row_ptr_managed.get(); + + if (!coo_row_ind || + !csr_row_ptr) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!coo_row_ind || !csr_row_ptr"); + return rocsparse_status_memory_error; + } + + status = rocsparse_coo2csr(handle, coo_row_ind, nnz, m, csr_row_ptr, idx_base); + + if (m < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || " + "nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + } + + return rocsparse_status_success; + } + + // Host structures + std::vector hcoo_row_ind(nnz); + std::vector hcoo_col_ind(nnz); + std::vector hcoo_val(nnz); + std::vector hcsr_row_ptr(m+1); + std::vector hcsr_row_ptr_gold(m+1, 0); + + // Sample initial COO matrix on CPU + srand(12345ULL); + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val, idx_base); + + // Allocate memory on the device + auto dcoo_row_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*nnz), + device_free}; + auto dcsr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*(m+1)), + device_free}; + + 
rocsparse_int *dcoo_row_ind = (rocsparse_int*) dcoo_row_ind_managed.get(); + rocsparse_int *dcsr_row_ptr = (rocsparse_int*) dcsr_row_ptr_managed.get(); + + if (!dcoo_row_ind || !dcsr_row_ptr) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dcoo_row_ind || !dcsr_row_ptr"); + return rocsparse_status_memory_error; + } + + // Copy data from host to device + CHECK_HIP_ERROR(hipMemcpy(dcoo_row_ind, hcoo_row_ind.data(), + sizeof(rocsparse_int)*nnz, hipMemcpyHostToDevice)); + + if (argus.unit_check) + { + CHECK_ROCSPARSE_ERROR(rocsparse_coo2csr(handle, dcoo_row_ind, nnz, + m, dcsr_row_ptr, idx_base)); + + // Copy output from device to host + CHECK_HIP_ERROR(hipMemcpy(hcsr_row_ptr.data(), dcsr_row_ptr, + sizeof(rocsparse_int)*(m+1), hipMemcpyDeviceToHost)); + + // CPU + double cpu_time_used = get_time_us(); + + // coo2csr on host + for (int i=0; i ) -target_link_libraries(rocsparse-test - PRIVATE - roc::rocsparse - ${GTEST_BOTH_LIBRARIES} - hip::hip_hcc - hip::hip_device -) +if(HIP_PLATFORM STREQUAL "hcc") + target_link_libraries(rocsparse-test + PRIVATE + roc::rocsparse + ${GTEST_BOTH_LIBRARIES} + hip::hip_hcc + hip::hip_device + ) + + foreach(amdgpu_target ${AMDGPU_TARGETS}) + target_link_libraries(rocsparse-test + PRIVATE + --amdgpu-target=${amdgpu_target} + ) + endforeach() +endif() -foreach(amdgpu_target ${AMDGPU_TARGETS}) +if(HIP_PLATFORM STREQUAL "nvcc") target_link_libraries(rocsparse-test PRIVATE - --amdgpu-target=${amdgpu_target} + roc::rocsparse + ${GTEST_BOTH_LIBRARIES} ) -endforeach() +endif() add_test(rocsparse-test rocsparse-test) diff --git a/clients/tests/test_coo2csr.cpp b/clients/tests/test_coo2csr.cpp new file mode 100644 index 00000000..d6c6bc27 --- /dev/null +++ b/clients/tests/test_coo2csr.cpp @@ -0,0 +1,53 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "testing_coo2csr.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef std::tuple coo2csr_tuple; + +int coo_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int coo_N_range[] = {-3, 0, 33, 242, 623, 1000}; +rocsparse_index_base coo_idx_base_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; + +class parameterized_coo2csr : public testing::TestWithParam +{ + protected: + parameterized_coo2csr() {} + virtual ~parameterized_coo2csr() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_coo2csr_arguments(coo2csr_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.timing = 0; + return arg; +} + +TEST(coo2csr_bad_arg, coo2csr) +{ + testing_coo2csr_bad_arg(); +} + +TEST_P(parameterized_coo2csr, coo2csr) +{ + Arguments arg = setup_coo2csr_arguments(GetParam()); + rocsparse_status status = testing_coo2csr(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(coo2csr, parameterized_coo2csr, + testing::Combine(testing::ValuesIn(coo_M_range), + testing::ValuesIn(coo_N_range), + testing::ValuesIn(coo_idx_base_range))); From 151a27eb3e77f54a16a6d3dd25c22d930388d716 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 9 May 2018 12:53:24 +0200 Subject: [PATCH 038/304] csr2coo: index_base_one support --- library/src/conversion/csr2coo_device.h | 5 +-- library/src/conversion/rocsparse_csr2coo.cpp | 34 +++++++------------- 2 files changed, 14 insertions(+), 25 deletions(-) diff --git a/library/src/conversion/csr2coo_device.h b/library/src/conversion/csr2coo_device.h index b82320cf..06ff4651 100644 --- a/library/src/conversion/csr2coo_device.h +++ b/library/src/conversion/csr2coo_device.h @@ -12,7 +12,8 @@ template __global__ void csr2coo_kernel(rocsparse_int m, const rocsparse_int *csr_row_ptr, - rocsparse_int *coo_row_ind) + 
rocsparse_int *coo_row_ind, + rocsparse_index_base idx_base) { rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; rocsparse_int lid = hipThreadIdx_x % THREADS; @@ -23,7 +24,7 @@ void csr2coo_kernel(rocsparse_int m, { for(rocsparse_int aj=csr_row_ptr[ai]+lid; ajstream; - // TODO - - - #define CSR2COO_DIM 512 rocsparse_int nnz_per_row = nnz / m; @@ -84,31 +73,31 @@ rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, { hipLaunchKernelGGL((csr2coo_kernel<2>), csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind); + m, csr_row_ptr, coo_row_ind, idx_base); } else if (nnz_per_row < 8) { hipLaunchKernelGGL((csr2coo_kernel<4>), csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind); + m, csr_row_ptr, coo_row_ind, idx_base); } else if (nnz_per_row < 16) { hipLaunchKernelGGL((csr2coo_kernel<8>), csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind); + m, csr_row_ptr, coo_row_ind, idx_base); } else if (nnz_per_row < 32) { hipLaunchKernelGGL((csr2coo_kernel<16>), csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind); + m, csr_row_ptr, coo_row_ind, idx_base); } else { hipLaunchKernelGGL((csr2coo_kernel<32>), csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind); + m, csr_row_ptr, coo_row_ind, idx_base); } } else if (handle->warp_size == 64) @@ -117,37 +106,37 @@ rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, { hipLaunchKernelGGL((csr2coo_kernel<2>), csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind); + m, csr_row_ptr, coo_row_ind, idx_base); } else if (nnz_per_row < 8) { hipLaunchKernelGGL((csr2coo_kernel<4>), csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind); + m, csr_row_ptr, coo_row_ind, idx_base); } else if (nnz_per_row < 16) { hipLaunchKernelGGL((csr2coo_kernel<8>), csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind); + m, csr_row_ptr, coo_row_ind, idx_base); } 
else if (nnz_per_row < 32) { hipLaunchKernelGGL((csr2coo_kernel<16>), csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind); + m, csr_row_ptr, coo_row_ind, idx_base); } else if (nnz_per_row < 64) { hipLaunchKernelGGL((csr2coo_kernel<32>), csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind); + m, csr_row_ptr, coo_row_ind, idx_base); } else { hipLaunchKernelGGL((csr2coo_kernel<64>), csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind); + m, csr_row_ptr, coo_row_ind, idx_base); } } else @@ -155,6 +144,5 @@ rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, return rocsparse_status_arch_mismatch; } #undef CSR2COO_DIM - return rocsparse_status_success; } From 801649f22521fbe3dca0e9bb51fd74f85391165f Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 9 May 2018 12:53:46 +0200 Subject: [PATCH 039/304] csr2coo test/benchmark --- clients/benchmarks/client.cpp | 7 +- clients/include/testing_csr2coo.hpp | 214 ++++++++++++++++++++++++++++ clients/tests/CMakeLists.txt | 1 + clients/tests/test_csr2coo.cpp | 53 +++++++ 4 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 clients/include/testing_csr2coo.hpp create mode 100644 clients/tests/test_csr2coo.cpp diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index fd0f642e..baeb6a4f 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -6,6 +6,7 @@ #include "rocsparse.hpp" #include "testing_csrmv.hpp" #include "testing_axpyi.hpp" +#include "testing_csr2coo.hpp" #include "testing_coo2csr.hpp" #include @@ -55,7 +56,7 @@ int main(int argc, char *argv[]) ("function,f", po::value(&function)->default_value("axpyi"), - "SPARSE function to test. Options: axpyi, csrmv, coo2csr") + "SPARSE function to test. 
Options: axpyi, csrmv, csr2coo, coo2csr") ("precision,r", po::value(&precision)->default_value('s'), "Options: s,d") @@ -123,6 +124,10 @@ int main(int argc, char *argv[]) else if (precision == 'd') testing_csrmv(argus); } + else if (function == "csr2coo") + { + testing_csr2coo(argus); + } else if (function == "coo2csr") { testing_coo2csr(argus); diff --git a/clients/include/testing_csr2coo.hpp b/clients/include/testing_csr2coo.hpp new file mode 100644 index 00000000..9478ee68 --- /dev/null +++ b/clients/include/testing_csr2coo.hpp @@ -0,0 +1,214 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef TESTING_CSR2COO_HPP +#define TESTING_CSR2COO_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +void testing_csr2coo_bad_arg(void) +{ + rocsparse_int m = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + auto csr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), + device_free}; + auto coo_row_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), + device_free}; + + rocsparse_int *csr_row_ptr = (rocsparse_int*) csr_row_ptr_managed.get(); + rocsparse_int *coo_row_ind = (rocsparse_int*) coo_row_ind_managed.get(); + + if (!csr_row_ptr || + !coo_row_ind) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // Testing for (csr_row_ptr == nullptr) + { + rocsparse_int *csr_row_ptr_null = nullptr; + status = rocsparse_csr2coo(handle, csr_row_ptr_null, nnz, m, + coo_row_ind, rocsparse_index_base_zero); + 
verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); + } + // Testing for (coo_row_ind == nullptr) + { + rocsparse_int *coo_row_ind_null = nullptr; + status = rocsparse_csr2coo(handle, csr_row_ptr, nnz, m, + coo_row_ind_null, rocsparse_index_base_zero); + verify_rocsparse_status_invalid_pointer(status, "Error: coo_row_ind is nullptr"); + } + // Testing for (handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + status = rocsparse_csr2coo(handle_null, csr_row_ptr, nnz, m, + coo_row_ind, rocsparse_index_base_zero); + verify_rocsparse_status_invalid_handle(status); + } +} + +rocsparse_status testing_csr2coo(Arguments argus) +{ + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + rocsparse_int safe_size = 100; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_status status; + + double scale = 0.02; + if (m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + // Argument sanity check before allocating invalid memory + if (m <= 0 || n <= 0 || nnz <= 0) + { + auto csr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), + device_free}; + auto coo_row_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), + device_free}; + + rocsparse_int *csr_row_ptr = (rocsparse_int*) csr_row_ptr_managed.get(); + rocsparse_int *coo_row_ind = (rocsparse_int*) coo_row_ind_managed.get(); + + if (!csr_row_ptr || + !coo_row_ind) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!csr_row_ptr || !coo_row_ind"); + return rocsparse_status_memory_error; + } + + status = rocsparse_csr2coo(handle, csr_row_ptr, nnz, m, coo_row_ind, idx_base); + + if (m < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || " + "nnz < 0"); + } + else + { + 
verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + } + + return rocsparse_status_success; + } + + // For testing, assemble a COO matrix and convert it to CSR first (on host) + + // Host structures + std::vector hcoo_row_ind(nnz); + std::vector hcoo_row_ind_gold(nnz); + std::vector hcoo_col_ind(nnz); + std::vector hcoo_val(nnz); + + // Sample initial COO matrix on CPU + srand(12345ULL); + gen_matrix_coo(m, n, nnz, hcoo_row_ind_gold, hcoo_col_ind, hcoo_val, idx_base); + + // Convert COO to CSR + std::vector hcsr_row_ptr(m+1); + + // csr2coo on host + for (int i=0; i +#include +#include + +typedef std::tuple csr2coo_tuple; + +int coo_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int coo_N_range[] = {-3, 0, 33, 242, 623, 1000}; +rocsparse_index_base coo_idx_base_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; + +class parameterized_csr2coo : public testing::TestWithParam +{ + protected: + parameterized_csr2coo() {} + virtual ~parameterized_csr2coo() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_csr2coo_arguments(csr2coo_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.timing = 0; + return arg; +} + +TEST(csr2coo_bad_arg, csr2coo) +{ + testing_csr2coo_bad_arg(); +} + +TEST_P(parameterized_csr2coo, csr2coo) +{ + Arguments arg = setup_csr2coo_arguments(GetParam()); + rocsparse_status status = testing_csr2coo(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(csr2coo, parameterized_csr2coo, + testing::Combine(testing::ValuesIn(coo_M_range), + testing::ValuesIn(coo_N_range), + testing::ValuesIn(coo_idx_base_range))); From 05b66ef946251cbb46e13019d941412963736099 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 9 May 2018 13:42:41 +0200 Subject: [PATCH 040/304] added function descriptions --- library/include/rocsparse-functions.h | 194 ++++++++++++++++++++++++-- 1 file changed, 179 
insertions(+), 15 deletions(-) diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index fb74373e..d88f10b5 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -26,12 +26,30 @@ extern "C" { * =========================================================================== */ - /*! \brief SPARSE Level 1 API +/*! \brief SPARSE Level 1 API - \details + \details + axpyi multiplies the sparse vector x with scalar alpha and adds the + result to the dense vector y - @param[in] + y := y + alpha * x + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + nnz number of non-zero entries of x. + @param[in] + alpha scalar alpha. + @param[in] + x_val array of nnz values. + @param[in] + x_ind array of nnz elements containing the indices of the non-zero + values of x. + @param[inout] + y array of values in dense format. + @param[in] + idx_base rocsparse_index_base_zero or rocsparse_index_base_one. ********************************************************************/ ROCSPARSE_EXPORT @@ -80,11 +98,11 @@ rocsparse_status rocsparse_zaxpyi(rocsparse_handle handle, /*! \brief SPARSE Level 2 API \details - csrmv multiplies the dense vector x[i] with scalar alpha and sparse m x n - matrix A that is defined in CSR storage format and add the result to y[i] - that is multiplied by beta, for i = 1 , … , n + csrmv multiplies the dense vector x with scalar alpha and sparse m x n + matrix A that is defined in CSR storage format and adds the result to the + dense vector y that is multiplied by beta - y := alpha * op(A) * x + beta * y, + y := alpha * op(A) * x + beta * y @param[in] handle rocsparse_handle. @@ -179,7 +197,36 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, rocsparse_double_complex *y); */ -// TODO +/*! 
\brief SPARSE Level 2 API + + \details + hybmv multiplies the dense vector x[i] with scalar alpha and sparse m x n + matrix A that is defined in HYB storage format and add the result to y[i] + that is multiplied by beta, for i = 1 , … , n + + y := alpha * op(A) * x + beta * y, + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + trans operation type of A. + @param[in] + alpha scalar alpha. + @param[in] + descr descriptor of A. + @param[in] + hyb matrix in HYB storage format. + @param[in] + x array of n elements (op(A) = A) or m elements (op(A) = A^T or + op(A) = A^H). + @param[in] + beta scalar beta. + @param[inout] + y array of m elements (op(A) = A) or n elements (op(A) = A^T or + op(A) = A^H). + + ********************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_shybmv(rocsparse_handle handle, rocsparse_operation trans, @@ -190,7 +237,6 @@ rocsparse_status rocsparse_shybmv(rocsparse_handle handle, const float *beta, float *y); -// TODO ROCSPARSE_EXPORT rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, rocsparse_operation trans, @@ -200,7 +246,27 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, const double *x, const double *beta, double *y); +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_shybmv(rocsparse_handle handle, + rocsparse_operation trans, + const rocsparse_float_complex *alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, + const rocsparse_float_complex *x, + const rocsparse_float_complex *beta, + rocsparse_float_complex *y); +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, + rocsparse_operation trans, + const rocsparse_double_complex *alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, + const rocsparse_double_complex *x, + const rocsparse_double_complex *beta, + rocsparse_double_complex *y); +*/ /* * 
=========================================================================== * level 3 SPARSE @@ -215,11 +281,32 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, /* * =========================================================================== - * Sparse format conversions + * Sparse Format Conversions * =========================================================================== */ -// TODO +/*! \brief SPARSE Format Conversions API + + \details + csr2coo converts the CSR array containing the row offset pointers into a + COO array of row indices. + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + csr_row_ptr array of m+1 elements that point to the start of every row + of A. + @param[in] + nnz number of non-zero entries of the sparse matrix A. + @param[in] + m number of rows of the sparse matrix A. + @param[out] + coo_row_ind array of nnz elements containing the row indices of A. + @param[in] + idx_base rocsparse_index_base_zero or rocsparse_index_base_one. + + ********************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, const rocsparse_int *csr_row_ptr, @@ -228,7 +315,28 @@ rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, rocsparse_int *coo_row_ind, rocsparse_index_base idx_base); -// TODO +/*! \brief SPARSE Format Conversions API + + \details + coo2csr converts the COO array containing the row indices into a + CSR array of row offset pointers. + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + coo_row_ind array of nnz elements containing the row indices of A. + @param[in] + nnz number of non-zero entries of the sparse matrix A. + @param[in] + m number of rows of the sparse matrix A. + @param[out] + csr_row_ptr array of m+1 elements that point to the start of every row + of A. 
+ @param[in] + idx_base rocsparse_index_base_zero or rocsparse_index_base_one. + + ********************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, const rocsparse_int *coo_row_ind, @@ -237,7 +345,41 @@ rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, rocsparse_int *csr_row_ptr, rocsparse_index_base idx_base); -// TODO +/*! \brief SPARSE Format Conversions API + + \details + csr2hyb converts a CSR matrix into a HYB matrix. + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + descr descriptor of A. + @param[in] + csr_val array of nnz elements of A. + @param[in] + csr_row_ptr array of m+1 elements that point to the start + of every row of A. + @param[in] + csr_col_ind array of nnz elements containing the column indices + of A. + @param[out] + hyb sparse matrix in HYB format + @param[in] + user_ell_width width of the ELL part of the HYB matrix (only + required if + partition_type == rocsparse_hyb_partition_user) + @param[in] + partition_type partitioning method can be + rocsparse_hyb_partition_auto (default) + rocsparse_hyb_partition_user + rocsparse_hyb_partition_max + + ********************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_scsr2hyb(rocsparse_handle handle, rocsparse_int m, @@ -250,7 +392,6 @@ rocsparse_status rocsparse_scsr2hyb(rocsparse_handle handle, rocsparse_int user_ell_width, rocsparse_hyb_partition partition_type); -// TODO ROCSPARSE_EXPORT rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, rocsparse_int m, @@ -262,8 +403,31 @@ rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, rocsparse_hyb_mat hyb, rocsparse_int user_ell_width, rocsparse_hyb_partition partition_type); +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsr2hyb(rocsparse_handle handle, + 
rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const rocsparse_float_complex *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type); - +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const rocsparse_double_complex *csr_val, + const rocsparse_int *csr_row_ptr, + const rocsparse_int *csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type); +*/ #ifdef __cplusplus } #endif From 63d41d0bdf7e8abcac8f0cad8b07cd173dac32fb Mon Sep 17 00:00:00 2001 From: Nico <31079890+ntrost57@users.noreply.github.com> Date: Fri, 11 May 2018 09:22:59 +0200 Subject: [PATCH 041/304] ubuntu 16.04 dockerfile --- docker/dockerfile-build-ubuntu-16.04 | 35 ++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 docker/dockerfile-build-ubuntu-16.04 diff --git a/docker/dockerfile-build-ubuntu-16.04 b/docker/dockerfile-build-ubuntu-16.04 new file mode 100644 index 00000000..db00a038 --- /dev/null +++ b/docker/dockerfile-build-ubuntu-16.04 @@ -0,0 +1,35 @@ +# Parameters related to building rocSPARSE +ARG base_image + +FROM ${base_image} +MAINTAINER Nico Trost + +ARG user_uid + +# Install dependent packages +# Dependencies: +# * hcc-config.cmake: pkg-config +# * rocsparse-test: googletest +# * rocsparse-bench: libboost-program-options-dev +# * libhsakmt.so: libnuma1 +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + sudo \ + build-essential \ + ca-certificates \ + git \ + make \ + cmake \ + clang-format-3.8 \ + pkg-config \ + libboost-program-options-dev \ + libnuma1 \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# docker pipeline runs containers with particular uid +# create a jenkins user with this 
specific uid so it can use sudo priviledges +# Grant any member of sudo group password-less sudo privileges +RUN useradd --create-home -u ${user_uid} -o -G sudo --shell /bin/bash jenkins && \ + mkdir -p /etc/sudoers.d/ && \ + echo '%sudo ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd From 9dd2bc73f7c046c0cbd9c91d7d6cfabeaf058f84 Mon Sep 17 00:00:00 2001 From: Nico <31079890+ntrost57@users.noreply.github.com> Date: Fri, 11 May 2018 09:23:53 +0200 Subject: [PATCH 042/304] jenkinsfile: building rocsparse --- Jenkinsfile | 208 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 Jenkinsfile diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 00000000..d7c6515c --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,208 @@ +#!/usr/bin/env groovy + +// Generated from snippet generator 'properties; set job properties' +properties([buildDiscarder(logRotator( + artifactDaysToKeepStr: '', + artifactNumToKeepStr: '', + daysToKeepStr: '', + numToKeepStr: '10')), + disableConcurrentBuilds(), + // parameters([booleanParam( name: 'push_image_to_docker_hub', defaultValue: false, description: 'Push rocsparse image to rocm docker-hub' )]), + [$class: 'CopyArtifactPermissionProperty', projectNames: '*'] + ]) + +//////////////////////////////////////////////////////////////////////// +// -- AUXILLARY HELPER FUNCTIONS +// import hudson.FilePath; +import java.nio.file.Path; + +//////////////////////////////////////////////////////////////////////// +// Construct the relative path of the build directory +void build_directory_rel( project_paths paths, compiler_data hcc_args ) +{ + // if( hcc_args.build_config.equalsIgnoreCase( 'release' ) ) + // { + // paths.project_build_prefix = paths.build_prefix + '/' + paths.project_name + '/release'; + // } + // else + // { + // paths.project_build_prefix = paths.build_prefix + '/' + paths.project_name + '/debug'; + // } + paths.project_build_prefix = paths.build_prefix + '/' + 
paths.project_name; + +} + +//////////////////////////////////////////////////////////////////////// +// Lots of images are created above; no apparent way to delete images:tags with docker global variable +def docker_clean_images( String org, String image_name ) +{ + // Check if any images exist first grepping for image names + int docker_images = sh( script: "docker images | grep \"${org}/${image_name}\"", returnStatus: true ) + + // The script returns a 0 for success (images were found ) + if( docker_images == 0 ) + { + // run bash script to clean images:tags after successful pushing + sh "docker images | grep \"${org}/${image_name}\" | awk '{print \$1 \":\" \$2}' | xargs docker rmi" + } +} + +//////////////////////////////////////////////////////////////////////// +// -- BUILD RELATED FUNCTIONS + +//////////////////////////////////////////////////////////////////////// +// Checkout source code, source dependencies and update version number numbers +// Returns a relative path to the directory where the source exists in the workspace +void checkout_and_version( project_paths paths ) +{ + paths.project_src_prefix = paths.src_prefix + '/' + paths.project_name + + dir( paths.project_src_prefix ) + { + // checkout rocsparse + checkout([ + $class: 'GitSCM', + branches: scm.branches, + doGenerateSubmoduleConfigurations: scm.doGenerateSubmoduleConfigurations, + extensions: scm.extensions + [[$class: 'CleanCheckout']], + userRemoteConfigs: scm.userRemoteConfigs + ]) + + if( fileExists( 'CMakeLists.txt' ) ) + { + def cmake_version_file = readFile( 'CMakeLists.txt' ).trim() + //echo "cmake_version_file:\n${cmake_version_file}" + + cmake_version_file = cmake_version_file.replaceAll(/(\d+\.)(\d+\.)(\d+\.)\d+/, "\$1\$2\$3${env.BUILD_ID}") + //echo "cmake_version_file:\n${cmake_version_file}" + writeFile( file: 'CMakeLists.txt', text: cmake_version_file ) + } + } + +} + +//////////////////////////////////////////////////////////////////////// +// This creates the docker image 
that we use to build the project in +// The docker images contains all dependencies, including OS platform, to build +def docker_build_image( docker_data docker_args, project_paths paths ) +{ + String build_image_name = "build" + def build_image = null + + dir( paths.project_src_prefix ) + { + def user_uid = sh( script: 'id -u', returnStdout: true ).trim() + + // Docker 17.05 introduced the ability to use ARG values in FROM statements + // Docker inspect failing on FROM statements with ARG https://issues.jenkins-ci.org/browse/JENKINS-44836 + // build_image = docker.build( "${paths.project_name}/${build_image_name}:latest", "--pull -f docker/${build_docker_file} --build-arg user_uid=${user_uid} --build-arg base_image=${from_image} ." ) + + // JENKINS-44836 workaround by using a bash script instead of docker.build() + sh "docker build -t ${paths.project_name}/${build_image_name}:latest -f docker/${docker_args.build_docker_file} ${docker_args.docker_build_args} --build-arg user_uid=${user_uid} --build-arg base_image=${docker_args.from_image} ." 
+ build_image = docker.image( "${paths.project_name}/${build_image_name}:latest" ) + } + + return build_image +} + +// Docker related variables gathered together to reduce parameter bloat on function calls +class docker_data implements Serializable +{ + String from_image + String build_docker_file + String install_docker_file + String docker_run_args + String docker_build_args +} + +// Docker related variables gathered together to reduce parameter bloat on function calls +class compiler_data implements Serializable +{ + String compiler_name + String build_config + String compiler_path +} + +// Paths variables bundled together to reduce parameter bloat on function calls +class project_paths implements Serializable +{ + String project_name + String src_prefix + String project_src_prefix + String build_prefix + String project_build_prefix + String build_command +} + +// This defines a common build pipeline used by most targets +def build_pipeline( compiler_data compiler_args, docker_data docker_args, project_paths rocsparse_paths, def docker_inside_closure ) +{ + ansiColor( 'vga' ) + { + // NOTE: build_succeeded does not appear to be local to each function invokation. I couldn't use it where each + // node had a different success value. 
+ def build_succeeded = false; + + stage( "Build ${compiler_args.compiler_name} ${compiler_args.build_config}" ) + { + // Checkout source code, dependencies and version files + checkout_and_version( rocsparse_paths ) + + // Conctruct a binary directory path based on build config + build_directory_rel( rocsparse_paths, compiler_args ); + + // Create/reuse a docker image that represents the rocsparse build environment + def rocsparse_build_image = docker_build_image( docker_args, rocsparse_paths ) + + // Print system information for the log + rocsparse_build_image.inside( docker_args.docker_run_args, docker_inside_closure ) + + // Build rocsparse inside of the build environment + build_succeeded = docker_build_inside_image( rocsparse_build_image, compiler_args, docker_args, rocsparse_paths ) + } + + // After a successful build, test the installer + // Only do this for rocm based builds + if( compiler_args.compiler_name.toLowerCase( ).startsWith( 'hcc-' ) ) + { + String job_name = env.JOB_NAME.toLowerCase( ) + String rocsparse_image_name = docker_test_install( compiler_args, docker_args, rocsparse_paths, job_name ) + + docker_clean_images( job_name, rocsparse_image_name ) + } + } +} + +hcc_rocm: +{ + node( 'docker && rocm && dkms' ) + { + def hcc_docker_args = new docker_data( + from_image:'rocm/dev-ubuntu-16.04:1.7.1', + build_docker_file:'dockerfile-build-ubuntu-16.04', + install_docker_file:'dockerfile-rocsparse-ubuntu-16.04', + docker_run_args:'--device=/dev/kfd --device=/dev/dri --group-add=video', + docker_build_args:' --pull' ) + + def hcc_compiler_args = new compiler_data( + compiler_name:'hcc-rocm', + build_config:'Release', + compiler_path:'/opt/rocm/bin/hcc' ) + + def rocsparse_paths = new project_paths( + project_name:'rocsparse-hcc-rocm', + src_prefix:'src', + build_prefix:'src', + build_command: './install.sh -cd' ) + + def print_version_closure = { + sh """ + set -x + /opt/rocm/bin/rocm_agent_enumerator -t ALL + /opt/rocm/bin/hcc --version + """ + } + + 
build_pipeline( hcc_compiler_args, hcc_docker_args, rocsparse_paths, print_version_closure ) + } +} From 9ec7992c2a7474b7503e51151c553e5f6204a642 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 11 May 2018 09:28:06 +0200 Subject: [PATCH 043/304] jenkinsfile update --- Jenkinsfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index d7c6515c..73bada7e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -158,7 +158,7 @@ def build_pipeline( compiler_data compiler_args, docker_data docker_args, projec rocsparse_build_image.inside( docker_args.docker_run_args, docker_inside_closure ) // Build rocsparse inside of the build environment - build_succeeded = docker_build_inside_image( rocsparse_build_image, compiler_args, docker_args, rocsparse_paths ) +// build_succeeded = docker_build_inside_image( rocsparse_build_image, compiler_args, docker_args, rocsparse_paths ) } // After a successful build, test the installer @@ -166,9 +166,9 @@ def build_pipeline( compiler_data compiler_args, docker_data docker_args, projec if( compiler_args.compiler_name.toLowerCase( ).startsWith( 'hcc-' ) ) { String job_name = env.JOB_NAME.toLowerCase( ) - String rocsparse_image_name = docker_test_install( compiler_args, docker_args, rocsparse_paths, job_name ) +// String rocsparse_image_name = docker_test_install( compiler_args, docker_args, rocsparse_paths, job_name ) - docker_clean_images( job_name, rocsparse_image_name ) +// docker_clean_images( job_name, rocsparse_image_name ) } } } From 7ca27e8f5e613314672596d5a028e4b8ca2e1016 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 11 May 2018 12:29:08 +0200 Subject: [PATCH 044/304] Jenkinsfile update --- Jenkinsfile | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 73bada7e..67bf213d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -105,6 +105,114 @@ def docker_build_image( docker_data 
docker_args, project_paths paths ) return build_image } +//////////////////////////////////////////////////////////////////////// +// This encapsulates the cmake configure, build and package commands +// Leverages docker containers to encapsulate the build in a fixed environment +Boolean docker_build_inside_image( def build_image, compiler_data compiler_args, docker_data docker_args, project_paths paths ) +{ + // Construct a relative path from build directory to src directory; used to invoke cmake + String rel_path_to_src = g_relativize( pwd( ), paths.project_src_prefix, paths.project_build_prefix ) + + String build_type_postfix = null + if( compiler_args.build_config.equalsIgnoreCase( 'release' ) ) + { + build_type_postfix = "" + } + else + { + build_type_postfix = "-d" + } + + // For the nvidia path, we somewhat arbitrarily choose to use the hcc-ctu rocsparse package + String rocsparse_archive_path=compiler_args.compiler_name; + if( rocsparse_archive_path.toLowerCase( ).startsWith( 'nvcc-' ) ) + { + rocsparse_archive_path='hcc-ctu' + } + + build_image.inside( docker_args.docker_run_args ) + { + withEnv(["CXX=${compiler_args.compiler_path}", 'CLICOLOR_FORCE=1']) + { + // Build library & clients + sh """#!/usr/bin/env bash + set -x + cd ${paths.project_build_prefix} + ${paths.build_command} + """ + } + +// stage( "Test ${compiler_args.compiler_name} ${compiler_args.build_config}" ) +// { +// // Cap the maximum amount of testing to be a few hours; assume failure if the time limit is hit +// timeout(time: 1, unit: 'HOURS') +// { +// sh """#!/usr/bin/env bash +// set -x +// cd ${paths.project_build_prefix}/build/release/clients/staging +// ./rocsparse-test${build_type_postfix} --gtest_output=xml --gtest_color=yes +// """ +// junit "${paths.project_build_prefix}/build/release/clients/staging/*.xml" +// } +// +// String docker_context = "${compiler_args.build_config}/${compiler_args.compiler_name}" +// if( compiler_args.compiler_name.toLowerCase( ).startsWith( 'hcc-' ) 
) +// { +// sh """#!/usr/bin/env bash +// set -x +// cd ${paths.project_build_prefix}/build/release +// make package +// """ +// +// sh """#!/usr/bin/env bash +// set -x +// rm -rf ${docker_context} && mkdir -p ${docker_context} +// mv ${paths.project_build_prefix}/build/release/*.deb ${docker_context} +// # mv ${paths.project_build_prefix}/build/release/*.rpm ${docker_context} +// dpkg -c ${docker_context}/*.deb +// """ +// +// archiveArtifacts artifacts: "${docker_context}/*.deb", fingerprint: true +// // archiveArtifacts artifacts: "${docker_context}/*.rpm", fingerprint: true +// } + } + } + + return true +} + +//////////////////////////////////////////////////////////////////////// +// This builds a fresh docker image FROM a clean base image, with no build dependencies included +// Uploads the new docker image to internal artifactory +// String docker_test_install( String hcc_ver, String artifactory_org, String from_image, String rocsparse_src_rel, String build_dir_rel ) +String docker_test_install( compiler_data compiler_args, docker_data docker_args, project_paths rocsparse_paths, String job_name ) +{ + def rocsparse_install_image = null + String image_name = "rocsparse-hip-${compiler_args.compiler_name}-ubuntu-16.04" + String docker_context = "${compiler_args.build_config}/${compiler_args.compiler_name}" + +// stage( "Artifactory ${compiler_args.compiler_name} ${compiler_args.build_config}" ) +// { +// // We copy the docker files into the bin directory where the .deb lives so that it's a clean build everytime +// sh """#!/usr/bin/env bash +// set -x +// mkdir -p ${docker_context} +// cp -r ${rocsparse_paths.project_src_prefix}/docker/* ${docker_context} +// """ +// +// // Docker 17.05 introduced the ability to use ARG values in FROM statements +// // Docker inspect failing on FROM statements with ARG https://issues.jenkins-ci.org/browse/JENKINS-44836 +// // rocsparse_install_image = docker.build( "${job_name}/${image_name}:${env.BUILD_NUMBER}", "--pull -f 
${build_dir_rel}/dockerfile-rocsparse-ubuntu-16.04 --build-arg base_image=${from_image} ${build_dir_rel}" ) +// +// // JENKINS-44836 workaround by using a bash script instead of docker.build() +// sh """docker build -t ${job_name}/${image_name} --pull -f ${docker_context}/${docker_args.install_docker_file} \ +// --build-arg base_image=${docker_args.from_image} ${docker_context}""" +// rocsparse_install_image = docker.image( "${job_name}/${image_name}" ) +// } + + return image_name +} + // Docker related variables gathered together to reduce parameter bloat on function calls class docker_data implements Serializable { @@ -158,7 +266,7 @@ def build_pipeline( compiler_data compiler_args, docker_data docker_args, projec rocsparse_build_image.inside( docker_args.docker_run_args, docker_inside_closure ) // Build rocsparse inside of the build environment -// build_succeeded = docker_build_inside_image( rocsparse_build_image, compiler_args, docker_args, rocsparse_paths ) + build_succeeded = docker_build_inside_image( rocsparse_build_image, compiler_args, docker_args, rocsparse_paths ) } // After a successful build, test the installer @@ -166,9 +274,9 @@ def build_pipeline( compiler_data compiler_args, docker_data docker_args, projec if( compiler_args.compiler_name.toLowerCase( ).startsWith( 'hcc-' ) ) { String job_name = env.JOB_NAME.toLowerCase( ) -// String rocsparse_image_name = docker_test_install( compiler_args, docker_args, rocsparse_paths, job_name ) + String rocsparse_image_name = docker_test_install( compiler_args, docker_args, rocsparse_paths, job_name ) -// docker_clean_images( job_name, rocsparse_image_name ) + docker_clean_images( job_name, rocsparse_image_name ) } } } From 3a1dd2a4d24f96c61a5009a0480ec3547024b29a Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 11 May 2018 12:45:46 +0200 Subject: [PATCH 045/304] axpyi test fix --- clients/include/testing_axpyi.hpp | 34 +++++++++++++++---------------- clients/tests/test_axpyi.cpp | 16 +++++++-------- 2 
files changed, 25 insertions(+), 25 deletions(-) diff --git a/clients/include/testing_axpyi.hpp b/clients/include/testing_axpyi.hpp index 2faea25a..051d229c 100644 --- a/clients/include/testing_axpyi.hpp +++ b/clients/include/testing_axpyi.hpp @@ -24,7 +24,7 @@ void testing_axpyi_bad_arg(void) I nnz = 100; I safe_size = 100; T alpha = 0.6; - base idxBase = rocsparse_index_base_zero; + base idx_base = rocsparse_index_base_zero; rocsparse_status status; std::unique_ptr unique_ptr_handle(new handle_struct); @@ -50,31 +50,31 @@ void testing_axpyi_bad_arg(void) // testing for (nullptr == dxInd) { I *dxInd_null = nullptr; - status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd_null, dy, idxBase); + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd_null, dy, idx_base); verify_rocsparse_status_invalid_pointer(status, "Error: xInd is nullptr"); } // testing for (nullptr == dxVal) { T *dxVal_null = nullptr; - status = rocsparse_axpyi(handle, nnz, &alpha, dxVal_null, dxInd, dy, idxBase); + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal_null, dxInd, dy, idx_base); verify_rocsparse_status_invalid_pointer(status, "Error: xVal is nullptr"); } // testing for (nullptr == dy) { T *dy_null = nullptr; - status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd, dy_null, idxBase); + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd, dy_null, idx_base); verify_rocsparse_status_invalid_pointer(status, "Error: y is nullptr"); } // testing for (nullptr == d_alpha) { T *d_alpha_null = nullptr; - status = rocsparse_axpyi(handle, nnz, d_alpha_null, dxVal, dxInd, dy, idxBase); + status = rocsparse_axpyi(handle, nnz, d_alpha_null, dxVal, dxInd, dy, idx_base); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } // testing for (nullptr == handle) { rocsparse_handle handle_null = nullptr; - status = rocsparse_axpyi(handle_null, nnz, &alpha, dxVal, dxInd, dy, idxBase); + status = rocsparse_axpyi(handle_null, nnz, &alpha, dxVal, dxInd, dy, 
idx_base); verify_rocsparse_status_invalid_handle(status); } } @@ -82,11 +82,11 @@ void testing_axpyi_bad_arg(void) template rocsparse_status testing_axpyi(Arguments argus) { - I N = argus.N; - I nnz = argus.nnz; - I safe_size = 100; - T h_alpha = argus.alpha; - rocsparse_index_base idxBase = argus.idxBase; + I N = argus.N; + I nnz = argus.nnz; + I safe_size = 100; + T h_alpha = argus.alpha; + rocsparse_index_base idx_base = argus.idx_base; rocsparse_status status; std::unique_ptr test_handle(new handle_struct); @@ -113,7 +113,7 @@ rocsparse_status testing_axpyi(Arguments argus) } CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - status = rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy, idxBase); + status = rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy, idx_base); if (nnz < 0) { @@ -184,11 +184,11 @@ rocsparse_status testing_axpyi(Arguments argus) // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy_1, idxBase)); + CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy_1, idx_base)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); - CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, d_alpha, dxVal, dxInd, dy_2, idxBase)); + CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, d_alpha, dxVal, dxInd, dy_2, idx_base)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * N, hipMemcpyDeviceToHost)); @@ -199,7 +199,7 @@ rocsparse_status testing_axpyi(Arguments argus) for (int i=0; i axpyi_tuple; int axpyi_N_range[] = {12000, 15332, 22031}; int axpyi_nnz_range[] = {-1, 0, 5, 10, 500, 1000, 7111, 10000}; std::vector axpyi_alpha_range = {1.0, 0.0}; -base axpyi_idxBase_range[] = {rocsparse_index_base_zero, - rocsparse_index_base_one}; +base 
axpyi_idx_base_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; class parameterized_axpyi : public testing::TestWithParam { @@ -30,11 +30,11 @@ class parameterized_axpyi : public testing::TestWithParam Arguments setup_axpyi_arguments(axpyi_tuple tup) { Arguments arg; - arg.N = std::get<0>(tup); - arg.nnz = std::get<1>(tup); - arg.alpha = std::get<2>(tup); - arg.idxBase = std::get<3>(tup); - arg.timing = 0; + arg.N = std::get<0>(tup); + arg.nnz = std::get<1>(tup); + arg.alpha = std::get<2>(tup); + arg.idx_base = std::get<3>(tup); + arg.timing = 0; return arg; } @@ -61,4 +61,4 @@ INSTANTIATE_TEST_CASE_P(axpyi, parameterized_axpyi, testing::Combine(testing::ValuesIn(axpyi_N_range), testing::ValuesIn(axpyi_nnz_range), testing::ValuesIn(axpyi_alpha_range), - testing::ValuesIn(axpyi_idxBase_range))); + testing::ValuesIn(axpyi_idx_base_range))); From aa3412012085adf31ba5c40b9c679e493aab967d Mon Sep 17 00:00:00 2001 From: Nico <31079890+ntrost57@users.noreply.github.com> Date: Fri, 11 May 2018 12:50:20 +0200 Subject: [PATCH 046/304] axpyi test fix (#4) --- clients/include/testing_axpyi.hpp | 34 +++++++++++++++---------------- clients/tests/test_axpyi.cpp | 16 +++++++-------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/clients/include/testing_axpyi.hpp b/clients/include/testing_axpyi.hpp index 2faea25a..051d229c 100644 --- a/clients/include/testing_axpyi.hpp +++ b/clients/include/testing_axpyi.hpp @@ -24,7 +24,7 @@ void testing_axpyi_bad_arg(void) I nnz = 100; I safe_size = 100; T alpha = 0.6; - base idxBase = rocsparse_index_base_zero; + base idx_base = rocsparse_index_base_zero; rocsparse_status status; std::unique_ptr unique_ptr_handle(new handle_struct); @@ -50,31 +50,31 @@ void testing_axpyi_bad_arg(void) // testing for (nullptr == dxInd) { I *dxInd_null = nullptr; - status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd_null, dy, idxBase); + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd_null, dy, idx_base); 
verify_rocsparse_status_invalid_pointer(status, "Error: xInd is nullptr"); } // testing for (nullptr == dxVal) { T *dxVal_null = nullptr; - status = rocsparse_axpyi(handle, nnz, &alpha, dxVal_null, dxInd, dy, idxBase); + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal_null, dxInd, dy, idx_base); verify_rocsparse_status_invalid_pointer(status, "Error: xVal is nullptr"); } // testing for (nullptr == dy) { T *dy_null = nullptr; - status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd, dy_null, idxBase); + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd, dy_null, idx_base); verify_rocsparse_status_invalid_pointer(status, "Error: y is nullptr"); } // testing for (nullptr == d_alpha) { T *d_alpha_null = nullptr; - status = rocsparse_axpyi(handle, nnz, d_alpha_null, dxVal, dxInd, dy, idxBase); + status = rocsparse_axpyi(handle, nnz, d_alpha_null, dxVal, dxInd, dy, idx_base); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } // testing for (nullptr == handle) { rocsparse_handle handle_null = nullptr; - status = rocsparse_axpyi(handle_null, nnz, &alpha, dxVal, dxInd, dy, idxBase); + status = rocsparse_axpyi(handle_null, nnz, &alpha, dxVal, dxInd, dy, idx_base); verify_rocsparse_status_invalid_handle(status); } } @@ -82,11 +82,11 @@ void testing_axpyi_bad_arg(void) template rocsparse_status testing_axpyi(Arguments argus) { - I N = argus.N; - I nnz = argus.nnz; - I safe_size = 100; - T h_alpha = argus.alpha; - rocsparse_index_base idxBase = argus.idxBase; + I N = argus.N; + I nnz = argus.nnz; + I safe_size = 100; + T h_alpha = argus.alpha; + rocsparse_index_base idx_base = argus.idx_base; rocsparse_status status; std::unique_ptr test_handle(new handle_struct); @@ -113,7 +113,7 @@ rocsparse_status testing_axpyi(Arguments argus) } CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - status = rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy, idxBase); + status = rocsparse_axpyi(handle, 
nnz, &h_alpha, dxVal, dxInd, dy, idx_base); if (nnz < 0) { @@ -184,11 +184,11 @@ rocsparse_status testing_axpyi(Arguments argus) // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy_1, idxBase)); + CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy_1, idx_base)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); - CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, d_alpha, dxVal, dxInd, dy_2, idxBase)); + CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, d_alpha, dxVal, dxInd, dy_2, idx_base)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * N, hipMemcpyDeviceToHost)); @@ -199,7 +199,7 @@ rocsparse_status testing_axpyi(Arguments argus) for (int i=0; i axpyi_tuple; int axpyi_N_range[] = {12000, 15332, 22031}; int axpyi_nnz_range[] = {-1, 0, 5, 10, 500, 1000, 7111, 10000}; std::vector axpyi_alpha_range = {1.0, 0.0}; -base axpyi_idxBase_range[] = {rocsparse_index_base_zero, - rocsparse_index_base_one}; +base axpyi_idx_base_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; class parameterized_axpyi : public testing::TestWithParam { @@ -30,11 +30,11 @@ class parameterized_axpyi : public testing::TestWithParam Arguments setup_axpyi_arguments(axpyi_tuple tup) { Arguments arg; - arg.N = std::get<0>(tup); - arg.nnz = std::get<1>(tup); - arg.alpha = std::get<2>(tup); - arg.idxBase = std::get<3>(tup); - arg.timing = 0; + arg.N = std::get<0>(tup); + arg.nnz = std::get<1>(tup); + arg.alpha = std::get<2>(tup); + arg.idx_base = std::get<3>(tup); + arg.timing = 0; return arg; } @@ -61,4 +61,4 @@ INSTANTIATE_TEST_CASE_P(axpyi, parameterized_axpyi, testing::Combine(testing::ValuesIn(axpyi_N_range), testing::ValuesIn(axpyi_nnz_range), testing::ValuesIn(axpyi_alpha_range), - 
testing::ValuesIn(axpyi_idxBase_range))); + testing::ValuesIn(axpyi_idx_base_range))); From 34e73427a816e68e6aced7fd1829901e167c5be3 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 11 May 2018 12:55:44 +0200 Subject: [PATCH 047/304] install script and dependencies --- cmake/get-cli-arguments.cmake | 25 ++++ deps/CMakeLists.txt | 69 ++++++++++ deps/external-boost.cmake | 181 ++++++++++++++++++++++++++ deps/external-gtest.cmake | 99 +++++++++++++++ install.sh | 231 ++++++++++++++++++++++++++++++++++ 5 files changed, 605 insertions(+) create mode 100644 cmake/get-cli-arguments.cmake create mode 100644 deps/CMakeLists.txt create mode 100644 deps/external-boost.cmake create mode 100644 deps/external-gtest.cmake create mode 100755 install.sh diff --git a/cmake/get-cli-arguments.cmake b/cmake/get-cli-arguments.cmake new file mode 100644 index 00000000..110bcfaa --- /dev/null +++ b/cmake/get-cli-arguments.cmake @@ -0,0 +1,25 @@ +# Attempt (best effort) to return a list of user specified parameters cmake was invoked with +# NOTE: Even if the user specifies CMAKE_INSTALL_PREFIX on the command line, the parameter is +# not returned because it does not have the matching helpstring + +function( append_cmake_cli_arguments initial_cli_args return_cli_args ) + + # Retrieves the contents of CMakeCache.txt + get_cmake_property( cmake_properties CACHE_VARIABLES ) + + foreach( property ${cmake_properties} ) + get_property(help_string CACHE ${property} PROPERTY HELPSTRING ) + + # Properties specified on the command line have boilerplate text + if( help_string MATCHES "variable specified on the command line" ) + # message( STATUS "property: ${property}") + # message( STATUS "value: ${${property}}") + + list( APPEND cli_args "-D${property}=${${property}}") + endif( ) + endforeach( ) + + # message( STATUS "get_command_line_arguments: ${cli_args}") + set( ${return_cli_args} ${${initial_cli_args}} ${cli_args} PARENT_SCOPE ) + +endfunction( ) \ No newline at end of file diff --git 
a/deps/CMakeLists.txt b/deps/CMakeLists.txt new file mode 100644 index 00000000..0092d61e --- /dev/null +++ b/deps/CMakeLists.txt @@ -0,0 +1,69 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. +# ######################################################################## +# Helper cmake script to automate building dependencies for rocsparse +# This script can be invoked manually by the user with 'cmake -P' + +# The ROCm platform requires Ubuntu 16.04 or Fedora 24, which has cmake 3.5 +cmake_minimum_required( VERSION 3.5 ) + +list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../cmake ) + +# Consider removing this in the future +# It can be annoying for visual studio developers to build a project that tries to install into 'program files' +if( WIN32 AND CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT ) + set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" FORCE ) +endif( ) + +# This has to be initialized before the project() command appears +# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE +if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) + set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." 
) +endif() + +# The superbuild does not build anything itself; all compiling is done in external projects +project( rocsparse-dependencies NONE ) + +option( BUILD_BOOST "Download and build boost library" ON ) +option( BUILD_GTEST "Download and build googletest library" ON ) +# option( BUILD_VERBOSE "Print helpful build debug information" OFF ) + +# if( BUILD_VERBOSE ) +# message( STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}" ) +# message( STATUS "CMAKE_BINARY_DIR: ${CMAKE_BINARY_DIR}" ) +# message( STATUS "CMAKE_SOURCE_DIR: ${CMAKE_SOURCE_DIR}" ) +# message( STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}" ) +# message( STATUS "CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}" ) +# message( STATUS "CMAKE_CURRENT_LIST_DIR: ${CMAKE_CURRENT_LIST_DIR}" ) +# message( STATUS "CMAKE_CURRENT_LIST_FILE: ${CMAKE_CURRENT_LIST_FILE}" ) +# endif( ) + +# This module scrapes the CMakeCache.txt file and attempts to get all the cli options the user specified to cmake invocation +include( get-cli-arguments ) + +# The following is a series of super-build projects; this cmake project will download and build +if( BUILD_GTEST ) + include( external-gtest ) + + list( APPEND rocsparse_dependencies googletest ) + set( gtest_custom_target COMMAND cd ${GTEST_BINARY_ROOT}$ ${CMAKE_COMMAND} --build . 
--target install ) +endif( ) + +if( BUILD_BOOST ) + include( external-boost ) + + list( APPEND rocsparse_dependencies boost ) + set( boost_custom_target COMMAND cd ${BOOST_BINARY_ROOT}$ ${Boost.Command} install ) +endif( ) + +# POLICY CMP0037 - "Target names should not be reserved and should match a validity pattern" +# Familiar target names like 'install' should be OK at the super-build level +if( POLICY CMP0037 ) + cmake_policy( SET CMP0037 OLD ) +endif( ) + +add_custom_target( install + ${boost_custom_target} + ${gtest_custom_target} + DEPENDS ${rocsparse_dependencies} +) diff --git a/deps/external-boost.cmake b/deps/external-boost.cmake new file mode 100644 index 00000000..c6f22493 --- /dev/null +++ b/deps/external-boost.cmake @@ -0,0 +1,181 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. +# ######################################################################## + +message( STATUS "Configuring boost external dependency" ) +include( ExternalProject ) +set( PREFIX_BOOST ${CMAKE_INSTALL_PREFIX} CACHE PATH "Location where boost should install, defaults to /usr/local" ) + +# We need to detect the compiler the user is attempting to invoke with CMake, +# we do our best to translate cmake parameters into bjam parameters +enable_language( CXX ) +include( build-bitness ) + +# TODO: Options should be added to allow downloading Boost straight from github + +# This file is used to add Boost as a library dependency to another project +# This sets up boost to download from sourceforge, and builds it as a cmake +# ExternalProject + +# Change this one line to upgrade to newer versions of boost +set( ext.Boost_VERSION "1.64.0" CACHE STRING "Boost version to download/use" ) +mark_as_advanced( ext.Boost_VERSION ) +string( REPLACE "." 
"_" ext.Boost_Version_Underscore ${ext.Boost_VERSION} ) + +message( STATUS "ext.Boost_VERSION: " ${ext.Boost_VERSION} ) + +if( WIN32 ) + # For newer cmake versions, 7z archives are much smaller to download + if( CMAKE_VERSION VERSION_LESS "3.1.0" ) + set( Boost_Ext "zip" ) + else( ) + set( Boost_Ext "7z" ) + endif( ) +else( ) + set( Boost_Ext "tar.bz2" ) +endif( ) + +if( WIN32 ) + set( Boost.Command b2 --prefix=${PREFIX_BOOST} ) +else( ) + set( Boost.Command ./b2 --prefix=${PREFIX_BOOST} ) +endif( ) + +if( CMAKE_COMPILER_IS_GNUCXX ) + list( APPEND Boost.Command cxxflags=-fPIC -std=c++11 ) +elseif( XCODE_VERSION OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang") ) + list( APPEND Boost.Command cxxflags=-std=c++11 -stdlib=libc++ linkflags=-stdlib=libc++ ) +endif( ) + +include( ProcessorCount ) +ProcessorCount( Cores ) +if( NOT Cores EQUAL 0 ) + # Travis can fail to build Boost sporadically; uses 32 cores, reduce stress on VM + if( DEFINED ENV{TRAVIS} ) + if( Cores GREATER 8 ) + set( Cores 8 ) + endif( ) + endif( ) + + # Add build thread in addition to the number of cores that we have + math( EXPR Cores "${Cores} + 1 " ) +else( ) + # If we could not detect # of cores, assume 1 core and add an additional build thread + set( Cores "2" ) +endif( ) + +message( STATUS "ExternalBoost using ( " ${Cores} " ) cores to build with" ) +message( STATUS "ExternalBoost building [ program_options, serialization, filesystem, system, regex ] components" ) + +list( APPEND Boost.Command -j ${Cores} --with-program_options --with-serialization --with-filesystem --with-system --with-regex ) + +if( BUILD_64 ) + list( APPEND Boost.Command address-model=64 ) +else( ) + list( APPEND Boost.Command address-model=32 ) +endif( ) + +if( MSVC10 ) + list( APPEND Boost.Command toolset=msvc-10.0 ) +elseif( MSVC11 ) + list( APPEND Boost.Command toolset=msvc-11.0 ) +elseif( MSVC12 ) + list( APPEND Boost.Command toolset=msvc-12.0 ) +elseif( MSVC14 ) + list( APPEND Boost.Command toolset=msvc-14.0 ) +elseif( 
XCODE_VERSION OR ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) ) + list( APPEND Boost.Command toolset=clang ) +elseif( CMAKE_COMPILER_IS_GNUCXX ) + list( APPEND Boost.Command toolset=gcc ) +endif( ) + +if( WIN32 AND (ext.Boost_VERSION VERSION_LESS "1.60.0") ) + list( APPEND Boost.Command define=BOOST_LOG_USE_WINNT6_API ) +endif( ) + +if( NOT DEFINED ext.Boost_LINK ) + if( ${BUILD_SHARED_LIBS} MATCHES "ON" ) + set( ext.Boost_LINK "shared" CACHE STRING "Which boost link method? static | shared | static,shared" ) + else( ) + set( ext.Boost_LINK "static" CACHE STRING "Which boost link method? static | shared | static,shared" ) + endif( ) +endif() +mark_as_advanced( ext.Boost_LINK ) + +if( WIN32 ) + # Versioned is the default on windows + set( ext.Boost_LAYOUT "versioned" CACHE STRING "Which boost layout method? versioned | tagged | system" ) + + # For windows, default to build both variants to support the VS IDE + set( ext.Boost_VARIANT "debug,release" CACHE STRING "Which boost variant? debug | release | debug,release" ) +else( ) + # Tagged builds provide unique enough names to be able to build both variants + set( ext.Boost_LAYOUT "tagged" CACHE STRING "Which boost layout method? versioned | tagged | system" ) + + # For Linux, typically a build tree only needs one variant + if( ${CMAKE_BUILD_TYPE} MATCHES "Debug") + set( ext.Boost_VARIANT "debug" CACHE STRING "Which boost variant? debug | release | debug,release" ) + else( ) + set( ext.Boost_VARIANT "release" CACHE STRING "Which boost variant? 
debug | release | debug,release" ) + endif( ) +endif( ) +mark_as_advanced( ext.Boost_LAYOUT ) +mark_as_advanced( ext.Boost_VARIANT ) + +list( APPEND Boost.Command --layout=${ext.Boost_LAYOUT} link=${ext.Boost_LINK} variant=${ext.Boost_VARIANT} ) + +message( STATUS "Boost.Command: ${Boost.Command}" ) + +# If the user has a cached local copy stored somewhere, they can define the full path to the package in a BOOST_URL environment variable +if( DEFINED ENV{BOOST_URL} ) + set( ext.Boost_URL "$ENV{BOOST_URL}" CACHE STRING "URL to download Boost from" ) +else( ) + set( ext.Boost_URL "http://sourceforge.net/projects/boost/files/boost/${ext.Boost_VERSION}/boost_${ext.Boost_Version_Underscore}.${Boost_Ext}/download" CACHE STRING "URL to download Boost from" ) +endif( ) +mark_as_advanced( ext.Boost_URL ) + +set( Boost.Bootstrap "" ) +set( ext.HASH "" ) +if( WIN32 ) + set( Boost.Bootstrap "bootstrap.bat" ) + + if( CMAKE_VERSION VERSION_LESS "3.1.0" ) + # .zip file + set( ext.HASH "b99973c805f38b549dbeaf88701c0abeff8b0e8eaa4066df47cac10a32097523" ) + else( ) + # .7z file + set( ext.HASH "49c6abfeb5b480f6a86119c0d57235966b4690ee6ff9e6401ee868244808d155" ) + endif( ) +else( ) + set( Boost.Bootstrap "./bootstrap.sh" ) + + # .tar.bz2 + set( ext.HASH "7bcc5caace97baa948931d712ea5f37038dbb1c5d89b43ad4def4ed7cb683332" ) + + if( XCODE_VERSION OR ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) ) + list( APPEND Boost.Bootstrap --with-toolset=clang ) + endif( ) +endif( ) + +# Below is a fancy CMake command to download, build and install Boost on the users computer +ExternalProject_Add( + boost + PREFIX ${CMAKE_BINARY_DIR}/boost + URL ${ext.Boost_URL} + URL_HASH SHA256=${ext.HASH} + UPDATE_COMMAND ${Boost.Bootstrap} + LOG_UPDATE 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND ${Boost.Command} stage + BUILD_IN_SOURCE 1 + LOG_BUILD 1 + INSTALL_COMMAND "" +) + +set_property( TARGET boost PROPERTY FOLDER "extern" ) +ExternalProject_Get_Property( boost install_dir ) +ExternalProject_Get_Property( boost 
binary_dir ) + +# For use by the user of ExternalGtest.cmake +set( BOOST_INSTALL_ROOT ${install_dir} ) +set( BOOST_BINARY_ROOT ${binary_dir} ) diff --git a/deps/external-gtest.cmake b/deps/external-gtest.cmake new file mode 100644 index 00000000..641418c9 --- /dev/null +++ b/deps/external-gtest.cmake @@ -0,0 +1,99 @@ +# ######################################################################## +# Copyright 2016 Advanced Micro Devices, Inc. +# ######################################################################## + +message( STATUS "Configuring gtest external dependency" ) +include( ExternalProject ) + +# set( gtest_cmake_args -DCMAKE_INSTALL_PREFIX=/package ) +set( PREFIX_GTEST ${CMAKE_INSTALL_PREFIX} CACHE PATH "Location where boost should install, defaults to /usr/local" ) +set( gtest_cmake_args -DCMAKE_INSTALL_PREFIX=${PREFIX_GTEST} ) +append_cmake_cli_arguments( gtest_cmake_args gtest_cmake_args ) + +set( gtest_git_repository "https://github.com/google/googletest.git" CACHE STRING "URL to download gtest from" ) +set( gtest_git_tag "release-1.8.0" CACHE STRING "URL to download gtest from" ) + +if( MSVC ) + list( APPEND gtest_cmake_args -Dgtest_force_shared_crt=ON -DCMAKE_DEBUG_POSTFIX=d ) +# else( ) + # GTEST_USE_OWN_TR1_TUPLE necessary to compile with hipcc + # list( APPEND gtest_cmake_args -DGTEST_USE_OWN_TR1_TUPLE=1 ) +endif( ) + +if( CMAKE_CONFIGURATION_TYPES ) + set( gtest_make + COMMAND ${CMAKE_COMMAND} --build --config Release + COMMAND ${CMAKE_COMMAND} --build --config Debug + ) +else( ) + # Add build thread in addition to the number of cores that we have + include( ProcessorCount ) + ProcessorCount( Cores ) + + # If we are not using an IDE, assume nmake with visual studio + if( MSVC ) + set( gtest_make "nmake" ) + else( ) + set( gtest_make "make" ) + + # The -j paramter does not work with nmake + if( NOT Cores EQUAL 0 ) + math( EXPR Cores "${Cores} + 1 " ) + list( APPEND gtest_make -j ${Cores} ) + else( ) + # If we could not detect # of cores, assume 1 
core and add an additional build thread + list( APPEND gtest_make -j 2 ) + endif( ) + endif( ) + + message( STATUS "ExternalGmock using ( " ${Cores} " ) cores to build with" ) +endif( ) + +# message( STATUS "gtest_make ( " ${gtest_make} " ) " ) +# message( STATUS "gtest_cmake_args ( " ${gtest_cmake_args} " ) " ) + +# Master branch has a new structure that combines googletest with googlemock +ExternalProject_Add( + googletest + PREFIX ${CMAKE_BINARY_DIR}/gtest + GIT_REPOSITORY ${gtest_git_repository} + GIT_TAG ${gtest_git_tag} + CMAKE_ARGS ${gtest_cmake_args} + BUILD_COMMAND ${gtest_make} + LOG_BUILD 1 + INSTALL_COMMAND "" + LOG_INSTALL 1 +) + +ExternalProject_Get_Property( googletest source_dir ) + +# For visual studio, the path 'debug' is hardcoded because that is the default VS configuration for a build. +# Doesn't matter if its the gtest or gtestd project above +set( package_dir "${PREFIX_GTEST}" ) +if( CMAKE_CONFIGURATION_TYPES ) + # Create a package by bundling libraries and header files + if( BUILD_64 ) + set( LIB_DIR lib64 ) + else( ) + set( LIB_DIR lib ) + endif( ) + + set( gtest_lib_dir "/${LIB_DIR}" ) + ExternalProject_Add_Step( googletest createPackage + COMMAND ${CMAKE_COMMAND} -E copy_directory ${gtest_lib_dir}/Debug ${package_dir}/${LIB_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory ${gtest_lib_dir}/Release ${package_dir}/${LIB_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory ${gtest_lib_dir}/Debug ${package_dir}/${LIB_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory ${gtest_lib_dir}/Release ${package_dir}/${LIB_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory /include ${package_dir}/include + COMMAND ${CMAKE_COMMAND} -E copy_directory /gtest/include/gtest ${package_dir}/include/gtest + DEPENDEES install + ) +endif( ) + +set_property( TARGET googletest PROPERTY FOLDER "extern") +ExternalProject_Get_Property( googletest install_dir ) +ExternalProject_Get_Property( googletest binary_dir ) + +# For use by the user of ExternalGtest.cmake +set( 
GTEST_INSTALL_ROOT ${install_dir} ) +set( GTEST_BINARY_ROOT ${binary_dir} ) diff --git a/install.sh b/install.sh new file mode 100755 index 00000000..94775f45 --- /dev/null +++ b/install.sh @@ -0,0 +1,231 @@ +#!/usr/bin/env bash +# Author: Kent Knox + +# ################################################# +# Pre-requisites check +# ################################################# +# Exit code 0: alls well +# Exit code 1: problems with getopt +# Exit code 2: problems with supported platforms + +# check if getopt command is installed +type getopt > /dev/null +if [[ $? -ne 0 ]]; then + echo "This script uses getopt to parse arguments; try installing the util-linux package"; + exit 1 +fi + +# lsb-release file describes the system +if [[ ! -e "/etc/lsb-release" ]]; then + echo "This script depends on the /etc/lsb-release file" + exit 2 +fi +source /etc/lsb-release + +if [[ ${DISTRIB_ID} != Ubuntu ]]; then + echo "This script only validated with Ubuntu" + exit 2 +fi + +# ################################################# +# helper functions +# ################################################# +function display_help() +{ + echo "rocsparse build & installation helper script" + echo "./install [-h|--help] " + echo " [-h|--help] prints this help message" + echo " [-i|--install] install after build" + echo " [-d|--dependencies] install build dependencies" + echo " [-c|--clients] build library clients too (combines with -i & -d)" + echo " [-g|--debug] -DCMAKE_BUILD_TYPE=Debug (default is =Release)" + echo " [--cuda] build library for cuda backend" +} + +# This function is helpful for dockerfiles that do not have sudo installed, but the default user is root +elevate_if_not_root( ) +{ + local uid=$(id -u) + + if (( ${uid} )); then + sudo $@ + else + $@ + fi +} + +# ################################################# +# global variables +# ################################################# +install_package=false +install_dependencies=false +build_clients=false +build_cuda=false 
+build_release=true + +# ################################################# +# Parameter parsing +# ################################################# + +# check if we have a modern version of getopt that can handle whitespace and long parameters +getopt -T +if [[ $? -eq 4 ]]; then + GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,install,clients,dependencies,debug,cuda --options hicgd -- "$@") +else + echo "Need a new version of getopt" + exit 1 +fi + +if [[ $? -ne 0 ]]; then + echo "getopt invocation failed; could not parse the command line"; + exit 1 +fi + +eval set -- "${GETOPT_PARSE}" + +while true; do + case "${1}" in + -h|--help) + display_help + exit 0 + ;; + -i|--install) + install_package=true + shift ;; + -d|--dependencies) + install_dependencies=true + shift ;; + -c|--clients) + build_clients=true + shift ;; + -g|--debug) + build_release=false + shift ;; + --cuda) + build_cuda=true + shift ;; + --) shift ; break ;; + *) echo "Unexpected command line parameter received; aborting"; + exit 1 + ;; + esac +done + +build_dir=./build +printf "\033[32mCreating project build directory in: \033[33m${build_dir}\033[0m\n" + +# ################################################# +# prep +# ################################################# +# ensure a clean build environment +if [[ "${build_release}" == true ]]; then + rm -rf ${build_dir}/release +else + rm -rf ${build_dir}/debug +fi + +# ################################################# +# install build dependencies on request +# ################################################# +if [[ "${install_dependencies}" == true ]]; then + # dependencies needed for rocsparse and clients to build + library_dependencies_ubuntu=( "make" "cmake-curses-gui" "hip_hcc" "pkg-config" ) + if [[ "${build_cuda}" == false ]]; then + library_dependencies_ubuntu+=( "hcc" ) + else + # Ideally, this could be cuda-cusparse-dev, but the package name has a version number in it + library_dependencies_ubuntu+=( "cuda" ) + fi + + 
client_dependencies_ubuntu=( "libboost-program-options-dev" ) + + elevate_if_not_root apt update + + # Dependencies required by main library + for package in "${library_dependencies_ubuntu[@]}"; do + if [[ $(dpkg-query --show --showformat='${db:Status-Abbrev}\n' ${package} 2> /dev/null | grep -q "ii"; echo $?) -ne 0 ]]; then + printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" + elevate_if_not_root apt install -y --no-install-recommends ${package} + fi + done + + # Dependencies required by library client apps + if [[ "${build_clients}" == true ]]; then + for package in "${client_dependencies_ubuntu[@]}"; do + if [[ $(dpkg-query --show --showformat='${db:Status-Abbrev}\n' ${package} 2> /dev/null | grep -q "ii"; echo $?) -ne 0 ]]; then + printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" + elevate_if_not_root apt install -y --no-install-recommends ${package} + fi + done + + # The following builds googletest from source + pushd . + printf "\033[32mBuilding \033[33mgoogletest\033[32m from source" + mkdir -p ${build_dir}/deps && cd ${build_dir}/deps + cmake -DBUILD_BOOST=OFF -DCMAKE_INSTALL_PREFIX=deps-install ../../deps + make -j$(nproc) + # elevate_if_not_root make install + make install + popd + fi + +fi + +export PATH=${PATH}:/opt/rocm/bin + +pushd . 
+ # ################################################# + # configure & build + # ################################################# + cmake_common_options="" + cmake_client_options="" + + # build type + if [[ "${build_release}" == true ]]; then + mkdir -p ${build_dir}/release/clients && cd ${build_dir}/release + cmake_common_options="${cmake_common_options} -DCMAKE_BUILD_TYPE=Release" + else + mkdir -p ${build_dir}/debug/clients && cd ${build_dir}/debug + cmake_common_options="${cmake_common_options} -DCMAKE_BUILD_TYPE=Debug" + fi + + # clients + if [[ "${build_clients}" == true ]]; then + cmake_client_options="${cmake_client_options} -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCHMARKS=ON -DBUILD_CLIENTS_SELFTEST=ON -DBUILD_CLIENTS_RIDER=ON" + fi + + # On ROCm platforms, hcc compiler can build everything + if [[ "${build_cuda}" == false ]]; then + CXX=hcc cmake ${cmake_common_options} ${cmake_client_options} -DCMAKE_PREFIX_PATH="$(pwd)/../deps/deps-install" ../.. + make -j$(nproc) + else + # The nvidia compile is a little more complicated, in that we split compiling the library from the clients + # We use the hipcc compiler to build the rocsparse library for a cuda backend (hipcc offloads the compile to nvcc) + # However, we run into a compiler incompatibility compiling the clients between nvcc and sparsew3.h 3.3.4 headers. + # The incompatibility is fixed in sparse v3.3.6, but that is not shipped by default on Ubuntu + # As a workaround, since clients do not contain device code, we opt to build clients with the native + # compiler on the platform. The compiler cmake chooses during configuration time is mostly unchangeable, + # so we launch multiple cmake invocation with a different compiler on each. + + # Build library only with hipcc as compiler + CXX=hipcc cmake ${cmake_common_options} -DCMAKE_INSTALL_PREFIX=rocsparse-install -DCPACK_PACKAGE_INSTALL_DIRECTORY=/opt/rocm ../.. 
+ make -j$(nproc) install + + # Build cuda clients with default host compiler + if [[ "${build_clients}" == true ]]; then + pushd clients + cmake ${cmake_common_options} ${cmake_client_options} -DCMAKE_PREFIX_PATH="$(pwd)/../rocsparse-install;$(pwd)/../deps/deps-install" ../../../clients + make -j$(nproc) + popd + fi + fi + + # ################################################# + # install + # ################################################# + # installing through package manager, which makes uninstalling easy + if [[ "${install_package}" == true ]]; then + make package + elevate_if_not_root dpkg -i rocsparse-*.deb + fi +popd From 103b851adfa846afe6f783cdff24551f67585f82 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 11 May 2018 13:09:55 +0200 Subject: [PATCH 048/304] Jenkinsfile update --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 67bf213d..a914fd34 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -175,7 +175,7 @@ Boolean docker_build_inside_image( def build_image, compiler_data compiler_args, // archiveArtifacts artifacts: "${docker_context}/*.deb", fingerprint: true // // archiveArtifacts artifacts: "${docker_context}/*.rpm", fingerprint: true // } - } +// } } return true From 5a15f1c89449a0ce7f2fefa99871d95f86684ea1 Mon Sep 17 00:00:00 2001 From: Nico <31079890+ntrost57@users.noreply.github.com> Date: Fri, 11 May 2018 13:25:05 +0200 Subject: [PATCH 049/304] axpyi test fix (#5) From 324dddf4f3a4c8ac3a00ed65530b6a60fb6f56db Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 11 May 2018 13:29:21 +0200 Subject: [PATCH 050/304] Jenkinsfile update --- Jenkinsfile | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index a914fd34..1c805e20 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -16,6 +16,18 @@ properties([buildDiscarder(logRotator( // import hudson.FilePath; import java.nio.file.Path; 
+//////////////////////////////////////////////////////////////////////// +// Calculate the relative path between two sub-directories from a common root +@NonCPS +String g_relativize( String root_string, String rel_source, String rel_build ) +{ + Path root_path = new File( root_string ).toPath( ) + Path path_src = root_path.resolve( rel_source ) + Path path_build = root_path.resolve( rel_build ) + + return path_build.relativize( path_src ).toString( ) +} + //////////////////////////////////////////////////////////////////////// // Construct the relative path of the build directory void build_directory_rel( project_paths paths, compiler_data hcc_args ) From ec7f832e425bebc39e51eff3b03f21cf58ed9ce0 Mon Sep 17 00:00:00 2001 From: Nico <31079890+ntrost57@users.noreply.github.com> Date: Fri, 11 May 2018 13:30:36 +0200 Subject: [PATCH 051/304] Jenkinsfile update --- Jenkinsfile | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index a914fd34..1c805e20 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -16,6 +16,18 @@ properties([buildDiscarder(logRotator( // import hudson.FilePath; import java.nio.file.Path; +//////////////////////////////////////////////////////////////////////// +// Calculate the relative path between two sub-directories from a common root +@NonCPS +String g_relativize( String root_string, String rel_source, String rel_build ) +{ + Path root_path = new File( root_string ).toPath( ) + Path path_src = root_path.resolve( rel_source ) + Path path_build = root_path.resolve( rel_build ) + + return path_build.relativize( path_src ).toString( ) +} + //////////////////////////////////////////////////////////////////////// // Construct the relative path of the build directory void build_directory_rel( project_paths paths, compiler_data hcc_args ) From 8af2496c64d831e0514436f9edd435877a972ed3 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 11 May 2018 13:37:47 +0200 Subject: [PATCH 052/304] Jenkinsfile update --- 
Jenkinsfile | 104 ++++++++++++++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1c805e20..51612808 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -154,40 +154,40 @@ Boolean docker_build_inside_image( def build_image, compiler_data compiler_args, """ } -// stage( "Test ${compiler_args.compiler_name} ${compiler_args.build_config}" ) -// { -// // Cap the maximum amount of testing to be a few hours; assume failure if the time limit is hit -// timeout(time: 1, unit: 'HOURS') -// { -// sh """#!/usr/bin/env bash -// set -x -// cd ${paths.project_build_prefix}/build/release/clients/staging -// ./rocsparse-test${build_type_postfix} --gtest_output=xml --gtest_color=yes -// """ -// junit "${paths.project_build_prefix}/build/release/clients/staging/*.xml" -// } -// -// String docker_context = "${compiler_args.build_config}/${compiler_args.compiler_name}" -// if( compiler_args.compiler_name.toLowerCase( ).startsWith( 'hcc-' ) ) -// { -// sh """#!/usr/bin/env bash -// set -x -// cd ${paths.project_build_prefix}/build/release -// make package -// """ -// -// sh """#!/usr/bin/env bash -// set -x -// rm -rf ${docker_context} && mkdir -p ${docker_context} -// mv ${paths.project_build_prefix}/build/release/*.deb ${docker_context} -// # mv ${paths.project_build_prefix}/build/release/*.rpm ${docker_context} -// dpkg -c ${docker_context}/*.deb -// """ -// -// archiveArtifacts artifacts: "${docker_context}/*.deb", fingerprint: true -// // archiveArtifacts artifacts: "${docker_context}/*.rpm", fingerprint: true -// } -// } + stage( "Test ${compiler_args.compiler_name} ${compiler_args.build_config}" ) + { + // Cap the maximum amount of testing to be a few hours; assume failure if the time limit is hit + timeout(time: 1, unit: 'HOURS') + { + sh """#!/usr/bin/env bash + set -x + cd ${paths.project_build_prefix}/build/release/clients/tests + ./rocsparse-test${build_type_postfix} --gtest_output=xml 
--gtest_color=yes + """ + junit "${paths.project_build_prefix}/build/release/clients/tests/*.xml" + } + + String docker_context = "${compiler_args.build_config}/${compiler_args.compiler_name}" + if( compiler_args.compiler_name.toLowerCase( ).startsWith( 'hcc-' ) ) + { + sh """#!/usr/bin/env bash + set -x + cd ${paths.project_build_prefix}/build/release + make package + """ + + sh """#!/usr/bin/env bash + set -x + rm -rf ${docker_context} && mkdir -p ${docker_context} + mv ${paths.project_build_prefix}/build/release/*.deb ${docker_context} + # mv ${paths.project_build_prefix}/build/release/*.rpm ${docker_context} + dpkg -c ${docker_context}/*.deb + """ + + archiveArtifacts artifacts: "${docker_context}/*.deb", fingerprint: true + // archiveArtifacts artifacts: "${docker_context}/*.rpm", fingerprint: true + } + } } return true @@ -203,24 +203,24 @@ String docker_test_install( compiler_data compiler_args, docker_data docker_args String image_name = "rocsparse-hip-${compiler_args.compiler_name}-ubuntu-16.04" String docker_context = "${compiler_args.build_config}/${compiler_args.compiler_name}" -// stage( "Artifactory ${compiler_args.compiler_name} ${compiler_args.build_config}" ) -// { -// // We copy the docker files into the bin directory where the .deb lives so that it's a clean build everytime -// sh """#!/usr/bin/env bash -// set -x -// mkdir -p ${docker_context} -// cp -r ${rocsparse_paths.project_src_prefix}/docker/* ${docker_context} -// """ -// -// // Docker 17.05 introduced the ability to use ARG values in FROM statements -// // Docker inspect failing on FROM statements with ARG https://issues.jenkins-ci.org/browse/JENKINS-44836 -// // rocsparse_install_image = docker.build( "${job_name}/${image_name}:${env.BUILD_NUMBER}", "--pull -f ${build_dir_rel}/dockerfile-rocsparse-ubuntu-16.04 --build-arg base_image=${from_image} ${build_dir_rel}" ) -// -// // JENKINS-44836 workaround by using a bash script instead of docker.build() -// sh """docker build -t 
${job_name}/${image_name} --pull -f ${docker_context}/${docker_args.install_docker_file} \ -// --build-arg base_image=${docker_args.from_image} ${docker_context}""" -// rocsparse_install_image = docker.image( "${job_name}/${image_name}" ) -// } + stage( "Artifactory ${compiler_args.compiler_name} ${compiler_args.build_config}" ) + { + // We copy the docker files into the bin directory where the .deb lives so that it's a clean build everytime + sh """#!/usr/bin/env bash + set -x + mkdir -p ${docker_context} + cp -r ${rocsparse_paths.project_src_prefix}/docker/* ${docker_context} + """ + + // Docker 17.05 introduced the ability to use ARG values in FROM statements + // Docker inspect failing on FROM statements with ARG https://issues.jenkins-ci.org/browse/JENKINS-44836 + // rocsparse_install_image = docker.build( "${job_name}/${image_name}:${env.BUILD_NUMBER}", "--pull -f ${build_dir_rel}/dockerfile-rocsparse-ubuntu-16.04 --build-arg base_image=${from_image} ${build_dir_rel}" ) + + // JENKINS-44836 workaround by using a bash script instead of docker.build() + sh """docker build -t ${job_name}/${image_name} --pull -f ${docker_context}/${docker_args.install_docker_file} \ + --build-arg base_image=${docker_args.from_image} ${docker_context}""" + rocsparse_install_image = docker.image( "${job_name}/${image_name}" ) + } return image_name } From f2d887c0a1069674d6cac773935607ae4bc75add Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 11 May 2018 14:12:54 +0200 Subject: [PATCH 053/304] docker install file and jenkinsfile update --- Jenkinsfile | 2 +- docker/dockerfile-install-ubuntu-16.04 | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 docker/dockerfile-install-ubuntu-16.04 diff --git a/Jenkinsfile b/Jenkinsfile index 51612808..1c38eef3 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -300,7 +300,7 @@ hcc_rocm: def hcc_docker_args = new docker_data( from_image:'rocm/dev-ubuntu-16.04:1.7.1', 
build_docker_file:'dockerfile-build-ubuntu-16.04', - install_docker_file:'dockerfile-rocsparse-ubuntu-16.04', + install_docker_file:'dockerfile-install-ubuntu-16.04', docker_run_args:'--device=/dev/kfd --device=/dev/dri --group-add=video', docker_build_args:' --pull' ) diff --git a/docker/dockerfile-install-ubuntu-16.04 b/docker/dockerfile-install-ubuntu-16.04 new file mode 100644 index 00000000..56ee71fb --- /dev/null +++ b/docker/dockerfile-install-ubuntu-16.04 @@ -0,0 +1,15 @@ +# Parameters related to building rocsparse +ARG base_image + +FROM ${base_image} +MAINTAINER Nico Trost + +# Copy the debian package of rocsparse into the container from host +COPY *.deb /tmp/ + +# Install the debian package +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends --allow-unauthenticated -y \ + /tmp/rocsparse-*.deb \ + && rm -f /tmp/*.deb \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* From b0c726d6c373723421654030e9249c90f12bebd6 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 11 May 2018 14:43:17 +0200 Subject: [PATCH 054/304] Jenkinsfile update --- Jenkinsfile | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index 1c38eef3..fb65dcd9 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -186,6 +186,21 @@ Boolean docker_build_inside_image( def build_image, compiler_data compiler_args, archiveArtifacts artifacts: "${docker_context}/*.deb", fingerprint: true // archiveArtifacts artifacts: "${docker_context}/*.rpm", fingerprint: true + + stage('Clang Format') + { + sh ''' + find . 
-iname \'*.h\' \ + -o -iname \'*.hpp\' \ + -o -iname \'*.cpp\' \ + -o -iname \'*.h.in\' \ + -o -iname \'*.hpp.in\' \ + -o -iname \'*.cpp.in\' \ + | grep -v 'build/' \ + | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-3.8 -style=file {} | diff - {}\' + ''' + } + } } } @@ -293,6 +308,40 @@ def build_pipeline( compiler_data compiler_args, docker_data docker_args, projec } } +// The following launches 3 builds in parallel: hcc-ctu, hcc-1.6 and cuda +parallel hcc_ctu: +{ + node( 'docker && rocm && dkms' ) + { + def docker_args = new docker_data( + from_image:'compute-artifactory:5001/rocm-developer-tools/hip/master/hip-hcc-ctu-ubuntu-16.04:latest', + build_docker_file:'dockerfile-build-ubuntu-16.04', + install_docker_file:'dockerfile-install-ubuntu-16.04', + docker_run_args:'--device=/dev/kfd --device=/dev/dri --group-add=video', + docker_build_args:' --pull' ) + + def compiler_args = new compiler_data( + compiler_name:'hcc-ctu', + build_config:'Release', + compiler_path:'/opt/rocm/bin/hcc' ) + + def rocsparse_paths = new project_paths( + project_name:'rocsparse-hcc-ctu', + src_prefix:'src', + build_prefix:'src', + build_command: './install.sh -cd' ) + + def print_version_closure = { + sh """ + set -x + /opt/rocm/bin/rocm_agent_enumerator -t ALL + /opt/rocm/bin/hcc --version + """ + } + + build_pipeline( compiler_args, docker_args, rocsparse_paths, print_version_closure ) + } +}, hcc_rocm: { node( 'docker && rocm && dkms' ) From 0949268d596b12f5f5b971b04fc9a8f0db1f5b6c Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 11 May 2018 15:09:01 +0200 Subject: [PATCH 055/304] Jenkinsfile update --- Jenkinsfile | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index fb65dcd9..477f5f09 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,11 +1,13 @@ #!/usr/bin/env groovy // Generated from snippet generator 'properties; set job properties' -properties([buildDiscarder(logRotator( - 
artifactDaysToKeepStr: '', - artifactNumToKeepStr: '', - daysToKeepStr: '', - numToKeepStr: '10')), +properties([ + pipelineTriggers([cron('0 3 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]), + buildDiscarder(logRotator( + artifactDaysToKeepStr: '', + artifactNumToKeepStr: '', + daysToKeepStr: '', + numToKeepStr: '10')), disableConcurrentBuilds(), // parameters([booleanParam( name: 'push_image_to_docker_hub', defaultValue: false, description: 'Push rocsparse image to rocm docker-hub' )]), [$class: 'CopyArtifactPermissionProperty', projectNames: '*'] @@ -187,19 +189,19 @@ Boolean docker_build_inside_image( def build_image, compiler_data compiler_args, archiveArtifacts artifacts: "${docker_context}/*.deb", fingerprint: true // archiveArtifacts artifacts: "${docker_context}/*.rpm", fingerprint: true - stage('Clang Format') - { - sh ''' - find . -iname \'*.h\' \ - -o -iname \'*.hpp\' \ - -o -iname \'*.cpp\' \ - -o -iname \'*.h.in\' \ - -o -iname \'*.hpp.in\' \ - -o -iname \'*.cpp.in\' \ - | grep -v 'build/' \ - | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-3.8 -style=file {} | diff - {}\' - ''' - } +// stage('Clang Format') +// { +// sh ''' +// find . 
-iname \'*.h\' \ +// -o -iname \'*.hpp\' \ +// -o -iname \'*.cpp\' \ +// -o -iname \'*.h.in\' \ +// -o -iname \'*.hpp.in\' \ +// -o -iname \'*.cpp.in\' \ +// | grep -v 'build/' \ +// | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-3.8 -style=file {} | diff - {}\' +// ''' +// } } } From 569c94e496901bb8e75d6296ae891584bce04700 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 14 May 2018 07:06:29 +0200 Subject: [PATCH 056/304] should catch exception by reference --- clients/benchmarks/client.cpp | 2 +- library/src/rocsparse_auxiliary.cpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index baeb6a4f..467d8e0a 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -23,7 +23,7 @@ int main(int argc, char *argv[]) argus.timing = 1; std::string function; - char precision; + char precision = 's'; rocsparse_int device_id; diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index 6c0468d7..caae10e8 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -32,7 +32,7 @@ rocsparse_status rocsparse_create_handle(rocsparse_handle *handle) log_trace(*handle, "rocsparse_create_handle"); } - catch(rocsparse_status status) + catch(const rocsparse_status &status) { return status; } @@ -52,7 +52,7 @@ rocsparse_status rocsparse_destroy_handle(rocsparse_handle handle) { delete handle; } - catch(rocsparse_status status) + catch(const rocsparse_status &status) { return status; } @@ -169,7 +169,7 @@ rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descr) { *descr = new _rocsparse_mat_descr; } - catch(rocsparse_status status) + catch(const rocsparse_status &status) { return status; } @@ -188,7 +188,7 @@ rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descr) { delete descr; } - catch(rocsparse_status status) + catch(const rocsparse_status &status) { return status; } 
@@ -287,7 +287,7 @@ rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat *hyb) { *hyb = new _rocsparse_hyb_mat; } - catch(rocsparse_status status) + catch(const rocsparse_status &status) { return status; } @@ -306,7 +306,7 @@ rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb) { delete hyb; } - catch(rocsparse_status status) + catch(const rocsparse_status &status) { return status; } From e6b02c26c07d8737fa7c15bd635a69881ce32ec5 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 14 May 2018 08:39:41 +0200 Subject: [PATCH 057/304] testing csrmv: now using sparse matrix generator --- clients/benchmarks/client.cpp | 4 +- clients/include/testing_csrmv.hpp | 257 +++++++++++++++++------------- clients/include/utility.hpp | 2 +- clients/tests/test_csrmv.cpp | 24 +-- 4 files changed, 161 insertions(+), 126 deletions(-) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 467d8e0a..b072b748 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -120,9 +120,9 @@ int main(int argc, char *argv[]) else if (function == "csrmv") { if (precision == 's') - testing_csrmv(argus); + testing_csrmv(argus); else if (precision == 'd') - testing_csrmv(argus); + testing_csrmv(argus); } else if (function == "csr2coo") { diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index 06f1ce62..f8fcc7a7 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -14,22 +14,19 @@ #include #include -typedef rocsparse_operation op; - using namespace rocsparse; using namespace rocsparse_test; -template +template void testing_csrmv_bad_arg(void) { - I n = 100; - I m = 100; - I nnz = 100; - I safe_size = 100; - T alpha = 0.6; - T beta = 0.2; - - op trans = rocsparse_operation_none; + rocsparse_int n = 100; + rocsparse_int m = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + T alpha = 0.6; + T beta = 0.2; + rocsparse_operation trans = 
rocsparse_operation_none; rocsparse_status status; std::unique_ptr unique_ptr_handle(new handle_struct); @@ -38,19 +35,24 @@ void testing_csrmv_bad_arg(void) std::unique_ptr unique_ptr_descr(new descr_struct); rocsparse_mat_descr descr = unique_ptr_descr->descr; - auto dptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), - device_free}; - auto dcol_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), - device_free}; - auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), - device_free}; - auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), - device_free}; - auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), - device_free}; - - I *dptr = (I*) dptr_managed.get(); - I *dcol = (I*) dcol_managed.get(); + auto dptr_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(rocsparse_int)*safe_size), + device_free}; + auto dcol_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(rocsparse_int)*safe_size), + device_free}; + auto dval_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*safe_size), + device_free}; + auto dx_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*safe_size), + device_free}; + auto dy_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*safe_size), + device_free}; + + rocsparse_int *dptr = (rocsparse_int*) dptr_managed.get(); + rocsparse_int *dcol = (rocsparse_int*) dcol_managed.get(); T *dval = (T*) dval_managed.get(); T *dx = (T*) dx_managed.get(); T *dy = (T*) dy_managed.get(); @@ -63,14 +65,14 @@ void testing_csrmv_bad_arg(void) // testing for (nullptr == dptr) { - I *dptr_null = nullptr; + rocsparse_int *dptr_null = nullptr; status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, dval, dptr_null, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); } // testing for (nullptr == dcol) { - I *dcol_null = nullptr; + rocsparse_int *dcol_null = nullptr; status = 
rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol_null, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); @@ -126,16 +128,16 @@ void testing_csrmv_bad_arg(void) } } -template +template rocsparse_status testing_csrmv(Arguments argus) { - I safe_size = 100; - I m = argus.M; - I n = argus.N; - I nnz = argus.nnz == 32 ? m * 0.02 * n : argus.nnz; // 2% non zeros - T h_alpha = argus.alpha; - T h_beta = argus.beta; - op trans = argus.trans; + rocsparse_int safe_size = 100; + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + T h_alpha = argus.alpha; + T h_beta = argus.beta; + rocsparse_operation trans = argus.trans; + rocsparse_index_base idx_base = argus.idx_base; rocsparse_status status; std::unique_ptr test_handle(new handle_struct); @@ -144,22 +146,35 @@ rocsparse_status testing_csrmv(Arguments argus) std::unique_ptr test_descr(new descr_struct); rocsparse_mat_descr descr = test_descr->descr; + // Determine number of non-zero elements + double scale = 0.02; + if (m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + // Argument sanity check before allocating invalid memory if(m <= 0 || n <= 0 || nnz <= 0) { - auto dptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), - device_free}; - auto dcol_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), - device_free}; - auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), - device_free}; - auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), - device_free}; - auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), - device_free}; - - I *dptr = (I*) dptr_managed.get(); - I *dcol = (I*) dcol_managed.get(); + auto dptr_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(rocsparse_int)*safe_size), + device_free}; + auto dcol_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(rocsparse_int)*safe_size), + 
device_free}; + auto dval_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*safe_size), + device_free}; + auto dx_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*safe_size), + device_free}; + auto dy_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*safe_size), + device_free}; + + rocsparse_int *dptr = (rocsparse_int*) dptr_managed.get(); + rocsparse_int *dcol = (rocsparse_int*) dcol_managed.get(); T *dval = (T*) dval_managed.get(); T *dx = (T*) dx_managed.get(); T *dy = (T*) dy_managed.get(); @@ -189,47 +204,34 @@ rocsparse_status testing_csrmv(Arguments argus) return rocsparse_status_success; } - // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice - std::vector hptr(m+1); - std::vector hcol(nnz); - std::vector hval(nnz); - std::vector hx(n); - std::vector hy_1(m); - std::vector hy_2(m); - std::vector hy_gold(m); + // Host structures + std::vector hcoo_row_ind; + std::vector hcoo_col_ind; + std::vector hcoo_val; // Initial Data on CPU srand(12345ULL); if (argus.filename != "") { - std::vector coo_row; - std::vector coo_col; - std::vector coo_val; - if (read_mtx_matrix(argus.filename.c_str(), m, n, nnz, - coo_row, coo_col, coo_val) != 0) + hcoo_row_ind, hcoo_col_ind, hcoo_val) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; } - - coo_to_csr(m, n, nnz, - coo_row, coo_col, coo_val, - hptr, hcol, hval); - coo_row.clear(); - coo_col.clear(); - coo_val.clear(); - hx.resize(n); - hy_1.resize(m); - hy_2.resize(m); - hy_gold.resize(m); } else { - rocsparse_init_csr(hptr, hcol, hval, m, n, nnz); + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val, idx_base); } + std::vector hcsr_row_ptr(m+1); + std::vector hx(n); + std::vector hy_1(m); + std::vector hy_2(m); + std::vector hy_gold(m); + rocsparse_init(hx, 1, n); rocsparse_init(hy_1, 1, m); @@ -238,25 +240,28 @@ rocsparse_status testing_csrmv(Arguments argus) hy_gold = hy_1; // 
allocate memory on device - auto dptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*(m+1)), - device_free}; - auto dcol_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*nnz), - device_free}; - auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*nnz), - device_free}; - auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*n), - device_free}; - auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*m), - device_free}; - auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*m), - device_free}; - auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), - device_free}; - auto d_beta_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), - device_free}; - - I *dptr = (I*) dptr_managed.get(); - I *dcol = (I*) dcol_managed.get(); + auto dptr_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(rocsparse_int)*(m+1)), device_free}; + auto drow_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(rocsparse_int)*nnz), device_free}; + auto dcol_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(rocsparse_int)*nnz), device_free}; + auto dval_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*nnz), device_free}; + auto dx_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*n), device_free}; + auto dy_1_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*m), device_free}; + auto dy_2_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*m), device_free}; + auto d_alpha_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)), device_free}; + auto d_beta_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)), device_free}; + + rocsparse_int *dptr = (rocsparse_int*) dptr_managed.get(); + rocsparse_int *drow = (rocsparse_int*) drow_managed.get(); + rocsparse_int *dcol = (rocsparse_int*) dcol_managed.get(); T *dval = (T*) dval_managed.get(); T *dx = (T*) dx_managed.get(); T *dy_1 = (T*) dy_1_managed.get(); @@ -264,24 +269,51 @@ rocsparse_status testing_csrmv(Arguments argus) T 
*d_alpha = (T*) d_alpha_managed.get(); T *d_beta = (T*) d_beta_managed.get(); - if(!dval || !dptr || !dcol || !dx || !dy_1 || !dy_2 || !d_alpha || !d_beta) + if(!dval || !dptr || !dcol || !drow || !dx || + !dy_1 || !dy_2 || !d_alpha || !d_beta) { verify_rocsparse_status_success(rocsparse_status_memory_error, - "!dval || !dptr || !dcol || !dx || !dy_1 || !dy_2 || !d_alpha || !d_beta"); + "!dval || !dptr || !dcol || !drow || !dx || " + "!dy_1 || !dy_2 || !d_alpha || !d_beta"); return rocsparse_status_memory_error; } // copy data from CPU to device - CHECK_HIP_ERROR(hipMemcpy(dptr, hptr.data(), sizeof(I)*(m+1), hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dcol, hcol.data(), sizeof(I)*nnz, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dval, hval.data(), sizeof(T)*nnz, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T)*n, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T)*m, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); - - double gpu_time_used, cpu_time_used; - double rocsparse_gflops, cpu_gflops, rocsparse_bandwidth; + CHECK_HIP_ERROR(hipMemcpy(drow, + hcoo_row_ind.data(), + sizeof(rocsparse_int)*(m+1), + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcol, + hcoo_col_ind.data(), + sizeof(rocsparse_int)*nnz, + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dval, + hcoo_val.data(), + sizeof(T)*nnz, + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, + hx.data(), + sizeof(T)*n, + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, + hy_1.data(), + sizeof(T)*m, + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, + &h_alpha, + sizeof(T), + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, + &h_beta, + sizeof(T), + hipMemcpyHostToDevice)); + + // Convert COO to CSR + 
CHECK_ROCSPARSE_ERROR(rocsparse_coo2csr(handle, drow, nnz, m, dptr, idx_base)); + CHECK_HIP_ERROR(hipMemcpy(hcsr_row_ptr.data(), + dptr, + sizeof(rocsparse_int)*(m+1), + hipMemcpyDeviceToHost)); if(argus.unit_check) { @@ -302,19 +334,18 @@ rocsparse_status testing_csrmv(Arguments argus) CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T)*m, hipMemcpyDeviceToHost)); // CPU - cpu_time_used = get_time_us(); + double cpu_time_used = get_time_us(); for (rocsparse_int i=0; i();//(double) rand() / RAND_MAX; } } diff --git a/clients/tests/test_csrmv.cpp b/clients/tests/test_csrmv.cpp index ad2c5a97..72c972c6 100644 --- a/clients/tests/test_csrmv.cpp +++ b/clients/tests/test_csrmv.cpp @@ -9,12 +9,14 @@ #include #include -typedef std::tuple csrmv_tuple; +typedef rocsparse_index_base base; +typedef std::tuple csrmv_tuple; int csr_M_range[] = {-1, 0, 10, 500, 7111, 10000}; int csr_N_range[] = {-3, 0, 33, 842, 4441, 10000}; std::vector csr_alpha_range = {2.0, 3.0}; std::vector csr_beta_range = {0.0, 1.0}; +base csr_idxbase_range[] = {rocsparse_index_base_zero}; class parameterized_csrmv : public testing::TestWithParam { @@ -28,30 +30,31 @@ class parameterized_csrmv : public testing::TestWithParam Arguments setup_csrmv_arguments(csrmv_tuple tup) { Arguments arg; - arg.M = std::get<0>(tup); - arg.N = std::get<1>(tup); - arg.alpha = std::get<2>(tup); - arg.beta = std::get<3>(tup); - arg.timing = 0; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.alpha = std::get<2>(tup); + arg.beta = std::get<3>(tup); + arg.idx_base = std::get<4>(tup); + arg.timing = 0; return arg; } TEST(csrmv_bad_arg, csrmv_float) { - testing_csrmv_bad_arg(); + testing_csrmv_bad_arg(); } TEST_P(parameterized_csrmv, csrmv_float) { Arguments arg = setup_csrmv_arguments(GetParam()); - rocsparse_status status = testing_csrmv(arg); + rocsparse_status status = testing_csrmv(arg); EXPECT_EQ(status, rocsparse_status_success); } TEST_P(parameterized_csrmv, csrmv_double) { Arguments arg = 
setup_csrmv_arguments(GetParam()); - rocsparse_status status = testing_csrmv(arg); + rocsparse_status status = testing_csrmv(arg); EXPECT_EQ(status, rocsparse_status_success); } @@ -59,4 +62,5 @@ INSTANTIATE_TEST_CASE_P(csrmv, parameterized_csrmv, testing::Combine(testing::ValuesIn(csr_M_range), testing::ValuesIn(csr_N_range), testing::ValuesIn(csr_alpha_range), - testing::ValuesIn(csr_beta_range))); + testing::ValuesIn(csr_beta_range), + testing::ValuesIn(csr_idxbase_range))); From cb131f574ecc04cdd93be45b35d2512e2eb52878 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 14 May 2018 09:20:57 +0200 Subject: [PATCH 058/304] test/benchmark csrmv: added laplacian matrix generator --- clients/benchmarks/client.cpp | 11 +- clients/include/testing_axpyi.hpp | 167 +++++++++++++++++------------- clients/include/testing_csrmv.hpp | 73 +++++++------ clients/include/utility.hpp | 21 ++-- clients/samples/example_csrmv.cpp | 2 +- clients/samples/example_ellmv.cpp | 2 +- clients/tests/test_axpyi.cpp | 6 +- 7 files changed, 163 insertions(+), 119 deletions(-) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index b072b748..8b790b07 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -46,7 +46,12 @@ int main(int argc, char *argv[]) ("mtx", po::value(&argus.filename)->default_value(""), "read from matrix " - "market (.mtx) format. This will override parameters m, n, and z") + "market (.mtx) format. This will override parameters m, n, and z.") + + ("laplacian-dim", + po::value(&argus.laplacian)->default_value(0), "assemble " + "laplacian matrix for 2D unit square with dimension . 
This will override " + "parameters m, n, z and mtx.") ("alpha", po::value(&argus.alpha)->default_value(1.0), "specifies the scalar alpha") @@ -113,9 +118,9 @@ int main(int argc, char *argv[]) if (function == "axpyi") { if (precision == 's') - testing_axpyi(argus); + testing_axpyi(argus); else if (precision == 'd') - testing_axpyi(argus); + testing_axpyi(argus); } else if (function == "csrmv") { diff --git a/clients/include/testing_axpyi.hpp b/clients/include/testing_axpyi.hpp index 051d229c..f27346ad 100644 --- a/clients/include/testing_axpyi.hpp +++ b/clients/include/testing_axpyi.hpp @@ -13,33 +13,31 @@ #include -typedef rocsparse_index_base base; - using namespace rocsparse; using namespace rocsparse_test; -template +template void testing_axpyi_bad_arg(void) { - I nnz = 100; - I safe_size = 100; - T alpha = 0.6; - base idx_base = rocsparse_index_base_zero; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + T alpha = 0.6; + rocsparse_index_base idx_base = rocsparse_index_base_zero; rocsparse_status status; std::unique_ptr unique_ptr_handle(new handle_struct); rocsparse_handle handle = unique_ptr_handle->handle; - auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), - device_free}; - auto dxInd_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*safe_size), - device_free}; - auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*safe_size), - device_free}; + auto dxVal_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*safe_size), device_free}; + auto dxInd_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(rocsparse_int)*safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*safe_size), device_free}; - T *dxVal = (T*) dxVal_managed.get(); - I *dxInd = (I*) dxInd_managed.get(); - T *dy = (T*) dy_managed.get(); + T *dxVal = (T*) dxVal_managed.get(); + rocsparse_int *dxInd = (rocsparse_int*) dxInd_managed.get(); + T *dy = (T*) dy_managed.get(); if(!dxInd || !dxVal || 
!dy) { @@ -49,7 +47,7 @@ void testing_axpyi_bad_arg(void) // testing for (nullptr == dxInd) { - I *dxInd_null = nullptr; + rocsparse_int *dxInd_null = nullptr; status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd_null, dy, idx_base); verify_rocsparse_status_invalid_pointer(status, "Error: xInd is nullptr"); } @@ -79,12 +77,12 @@ void testing_axpyi_bad_arg(void) } } -template +template rocsparse_status testing_axpyi(Arguments argus) { - I N = argus.N; - I nnz = argus.nnz; - I safe_size = 100; + rocsparse_int N = argus.N; + rocsparse_int nnz = argus.nnz; + rocsparse_int safe_size = 100; T h_alpha = argus.alpha; rocsparse_index_base idx_base = argus.idx_base; rocsparse_status status; @@ -95,24 +93,26 @@ rocsparse_status testing_axpyi(Arguments argus) // Argument sanity check before allocating invalid memory if(nnz <= 0) { - auto dxInd_managed = rocsparse_unique_ptr{device_malloc(sizeof(I) * safe_size), - device_free}; - auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), - device_free}; - auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), - device_free}; + auto dxInd_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dxVal_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T) * safe_size), device_free}; - I *dxInd = (I*) dxInd_managed.get(); - T *dxVal = (T*) dxVal_managed.get(); - T *dy = (T*) dy_managed.get(); + rocsparse_int *dxInd = (rocsparse_int*) dxInd_managed.get(); + T *dxVal = (T*) dxVal_managed.get(); + T *dy = (T*) dy_managed.get(); if(!dxInd || !dxVal || !dy) { - verify_rocsparse_status_success(rocsparse_status_memory_error, "!dxInd || !dxVal || !dy"); + verify_rocsparse_status_success( + rocsparse_status_memory_error, "!dxInd || !dxVal || !dy"); return rocsparse_status_memory_error; } - CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, 
rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR( + rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); status = rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy, idx_base); if (nnz < 0) @@ -127,8 +127,8 @@ rocsparse_status testing_axpyi(Arguments argus) return rocsparse_status_success; } - // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice - std::vector hxInd(nnz); + // Host structures + std::vector hxInd(nnz); std::vector hxVal(nnz); std::vector hy_1(N); std::vector hy_2(N); @@ -145,22 +145,22 @@ rocsparse_status testing_axpyi(Arguments argus) hy_gold = hy_1; // allocate memory on device - auto dxInd_managed = rocsparse_unique_ptr{device_malloc(sizeof(I)*nnz), - device_free}; - auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*nnz), - device_free}; - auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*N), - device_free}; - auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)*N), - device_free}; - auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), - device_free}; - - I *dxInd = (I*) dxInd_managed.get(); - T *dxVal = (T*) dxVal_managed.get(); - T *dy_1 = (T*) dy_1_managed.get(); - T *dy_2 = (T*) dy_2_managed.get(); - T *d_alpha = (T*) d_alpha_managed.get(); + auto dxInd_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(rocsparse_int)*nnz), device_free}; + auto dxVal_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*nnz), device_free}; + auto dy_1_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*N), device_free}; + auto dy_2_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)*N), device_free}; + auto d_alpha_managed = rocsparse_unique_ptr{ + device_malloc(sizeof(T)), device_free}; + + rocsparse_int *dxInd = (rocsparse_int*) dxInd_managed.get(); + T *dxVal = (T*) dxVal_managed.get(); + T *dy_1 = (T*) dy_1_managed.get(); + T *dy_2 = (T*) dy_2_managed.get(); + T *d_alpha = (T*) d_alpha_managed.get(); if(!dxInd || 
!dxVal || !dy_1 || !dy_2 || !d_alpha) { @@ -170,40 +170,57 @@ rocsparse_status testing_axpyi(Arguments argus) } // copy data from CPU to device - CHECK_HIP_ERROR(hipMemcpy(dxInd, hxInd.data(), sizeof(I) * nnz, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dxVal, hxVal.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T) * N, hipMemcpyHostToDevice)); - - double gpu_time_used, cpu_time_used; - double rocsparse_gflops, cpu_gflops, rocsparse_bandwidth; + CHECK_HIP_ERROR(hipMemcpy(dxInd, + hxInd.data(), + sizeof(rocsparse_int)*nnz, + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dxVal, + hxVal.data(), + sizeof(T)*nnz, + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, + hy_1.data(), + sizeof(T)*N, + hipMemcpyHostToDevice)); if(argus.unit_check) { - CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * N, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_2, + hy_2.data(), + sizeof(T)*N, + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, + &h_alpha, + sizeof(T), + hipMemcpyHostToDevice)); // ROCSPARSE pointer mode host - CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy_1, idx_base)); + CHECK_ROCSPARSE_ERROR( + rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR( + rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy_1, idx_base)); // ROCSPARSE pointer mode device - CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); - CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, d_alpha, dxVal, dxInd, dy_2, idx_base)); + CHECK_ROCSPARSE_ERROR( + rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR( + rocsparse_axpyi(handle, nnz, d_alpha, dxVal, dxInd, dy_2, idx_base)); 
// copy output from device to CPU - CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * N, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * N, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(hy_1.data(), dy_1, sizeof(T)*N, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(hy_2.data(), dy_2, sizeof(T)*N, hipMemcpyDeviceToHost)); // CPU - cpu_time_used = get_time_us(); + double cpu_time_used = get_time_us(); - for (int i=0; i hcsr_row_ptr; std::vector hcoo_row_ind; - std::vector hcoo_col_ind; - std::vector hcoo_val; + std::vector hcol_ind; + std::vector hval; // Initial Data on CPU srand(12345ULL); - if (argus.filename != "") + if (argus.laplacian) { - if (read_mtx_matrix(argus.filename.c_str(), - m, n, nnz, - hcoo_row_ind, hcoo_col_ind, hcoo_val) != 0) - { - fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); - return rocsparse_status_internal_error; - } + m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, + hcol_ind, hval, idx_base); + nnz = hcsr_row_ptr[m]; } else { - gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val, idx_base); + if (argus.filename != "") + { + if (read_mtx_matrix(argus.filename.c_str(), + m, n, nnz, + hcoo_row_ind, hcol_ind, hval) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base); + } + + // Convert COO to CSR + if (!argus.laplacian) + { + hcsr_row_ptr.resize(m+1, 0); + for (int i=0; i hcsr_row_ptr(m+1); std::vector hx(n); std::vector hy_1(m); std::vector hy_2(m); @@ -242,8 +267,6 @@ rocsparse_status testing_csrmv(Arguments argus) // allocate memory on device auto dptr_managed = rocsparse_unique_ptr{ device_malloc(sizeof(rocsparse_int)*(m+1)), device_free}; - auto drow_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(rocsparse_int)*nnz), device_free}; auto dcol_managed = rocsparse_unique_ptr{ 
device_malloc(sizeof(rocsparse_int)*nnz), device_free}; auto dval_managed = rocsparse_unique_ptr{ @@ -260,7 +283,6 @@ rocsparse_status testing_csrmv(Arguments argus) device_malloc(sizeof(T)), device_free}; rocsparse_int *dptr = (rocsparse_int*) dptr_managed.get(); - rocsparse_int *drow = (rocsparse_int*) drow_managed.get(); rocsparse_int *dcol = (rocsparse_int*) dcol_managed.get(); T *dval = (T*) dval_managed.get(); T *dx = (T*) dx_managed.get(); @@ -269,26 +291,26 @@ rocsparse_status testing_csrmv(Arguments argus) T *d_alpha = (T*) d_alpha_managed.get(); T *d_beta = (T*) d_beta_managed.get(); - if(!dval || !dptr || !dcol || !drow || !dx || + if(!dval || !dptr || !dcol || !dx || !dy_1 || !dy_2 || !d_alpha || !d_beta) { verify_rocsparse_status_success(rocsparse_status_memory_error, - "!dval || !dptr || !dcol || !drow || !dx || " + "!dval || !dptr || !dcol || !dx || " "!dy_1 || !dy_2 || !d_alpha || !d_beta"); return rocsparse_status_memory_error; } // copy data from CPU to device - CHECK_HIP_ERROR(hipMemcpy(drow, - hcoo_row_ind.data(), + CHECK_HIP_ERROR(hipMemcpy(dptr, + hcsr_row_ptr.data(), sizeof(rocsparse_int)*(m+1), hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dcol, - hcoo_col_ind.data(), + hcol_ind.data(), sizeof(rocsparse_int)*nnz, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dval, - hcoo_val.data(), + hval.data(), sizeof(T)*nnz, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dx, @@ -308,13 +330,6 @@ rocsparse_status testing_csrmv(Arguments argus) sizeof(T), hipMemcpyHostToDevice)); - // Convert COO to CSR - CHECK_ROCSPARSE_ERROR(rocsparse_coo2csr(handle, drow, nnz, m, dptr, idx_base)); - CHECK_HIP_ERROR(hipMemcpy(hcsr_row_ptr.data(), - dptr, - sizeof(rocsparse_int)*(m+1), - hipMemcpyDeviceToHost)); - if(argus.unit_check) { CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T)*m, hipMemcpyHostToDevice)); @@ -341,7 +356,7 @@ rocsparse_status testing_csrmv(Arguments argus) hy_gold[i] *= h_beta; for (rocsparse_int j=hcsr_row_ptr[i]; j 
rocsparse_int gen_2d_laplacian(rocsparse_int ndim, std::vector &rowptr, std::vector &col, - std::vector &val) + std::vector &val, + rocsparse_index_base idx_base) { if (ndim == 0) { return 0; @@ -173,42 +174,42 @@ rocsparse_int gen_2d_laplacian(rocsparse_int ndim, for (rocsparse_int j=0; j(-1); ++nnz; } // if no left boundary element, connect with left neighbor if (j != 0) { - col[nnz] = idx - 1; + col[nnz] = idx - 1 + idx_base; val[nnz] = static_cast(-1); ++nnz; } // element itself - col[nnz] = idx; + col[nnz] = idx + idx_base; val[nnz] = static_cast(4); ++nnz; // if no right boundary element, connect with right neighbor if (j != ndim - 1) { - col[nnz] = idx + 1; + col[nnz] = idx + 1 + idx_base; val[nnz] = static_cast(-1); ++nnz; } // if no lower boundary element, connect with lower neighbor if (i != ndim - 1) { - col[nnz] = idx + ndim; + col[nnz] = idx + ndim + idx_base; val[nnz] = static_cast(-1); ++nnz; } } } - rowptr[n] = nnz; + rowptr[n] = nnz + idx_base; return n; } @@ -561,6 +562,7 @@ class Arguments rocsparse_int timing = 0; rocsparse_int iters = 10; + rocsparse_int laplacian = 0; std::string filename = ""; @@ -580,6 +582,9 @@ class Arguments unit_check = rhs.unit_check; timing = rhs.timing; + iters = rhs.iters; + laplacian = rhs.laplacian; + filename = rhs.filename; return *this; diff --git a/clients/samples/example_csrmv.cpp b/clients/samples/example_csrmv.cpp index e659985e..293969ff 100644 --- a/clients/samples/example_csrmv.cpp +++ b/clients/samples/example_csrmv.cpp @@ -46,7 +46,7 @@ int main(int argc, char *argv[]) std::vector hAptr; std::vector hAcol; std::vector hAval; - int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval); + int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); int n = m; int nnz = hAptr[m]; diff --git a/clients/samples/example_ellmv.cpp b/clients/samples/example_ellmv.cpp index e6c344ea..5d01c84d 100644 --- a/clients/samples/example_ellmv.cpp +++ b/clients/samples/example_ellmv.cpp @@ -46,7 +46,7 @@ int 
main(int argc, char *argv[]) std::vector hAptr; std::vector hAcol; std::vector hAval; - int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval); + int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); int n = m; int nnz = hAptr[m]; diff --git a/clients/tests/test_axpyi.cpp b/clients/tests/test_axpyi.cpp index 20d58d34..7d694753 100644 --- a/clients/tests/test_axpyi.cpp +++ b/clients/tests/test_axpyi.cpp @@ -40,20 +40,20 @@ Arguments setup_axpyi_arguments(axpyi_tuple tup) TEST(axpyi_bad_arg, axpyi_float) { - testing_axpyi_bad_arg(); + testing_axpyi_bad_arg(); } TEST_P(parameterized_axpyi, axpyi_float) { Arguments arg = setup_axpyi_arguments(GetParam()); - rocsparse_status status = testing_axpyi(arg); + rocsparse_status status = testing_axpyi(arg); EXPECT_EQ(status, rocsparse_status_success); } TEST_P(parameterized_axpyi, axpyi_double) { Arguments arg = setup_axpyi_arguments(GetParam()); - rocsparse_status status = testing_axpyi(arg); + rocsparse_status status = testing_axpyi(arg); EXPECT_EQ(status, rocsparse_status_success); } From 6ad7e5991306aae323f28a6cdcd9766a4d6d2e9e Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 14 May 2018 11:35:46 +0200 Subject: [PATCH 059/304] style: formatting --- library/include/rocsparse-functions.h | 14 ++---- library/include/rocsparse-types.h | 12 ++--- library/include/rocsparse-version.h.in | 2 + library/src/conversion/csr2coo_device.h | 4 +- library/src/conversion/csr2hyb_device.h | 10 ++-- library/src/conversion/rocsparse_csr2hyb.cpp | 14 +++--- library/src/handle.cpp | 4 +- library/src/include/definitions.h | 51 ++++++++++---------- library/src/include/handle.h | 3 ++ library/src/rocsparse_auxiliary.cpp | 45 +++++++---------- library/src/status.cpp | 5 +- 11 files changed, 77 insertions(+), 87 deletions(-) diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index d88f10b5..5018915b 100644 --- a/library/include/rocsparse-functions.h +++ 
b/library/include/rocsparse-functions.h @@ -138,7 +138,7 @@ rocsparse_status rocsparse_zaxpyi(rocsparse_handle handle, ********************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, @@ -153,7 +153,7 @@ rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, ROCSPARSE_EXPORT rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, @@ -168,7 +168,7 @@ rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, /* ROCSPARSE_EXPORT rocsparse_status rocsparse_ccsrmv(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, @@ -183,7 +183,7 @@ rocsparse_status rocsparse_ccsrmv(rocsparse_handle handle, ROCSPARSE_EXPORT rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, @@ -273,12 +273,6 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, * =========================================================================== */ - - - - - - /* * =========================================================================== * Sparse Format Conversions diff --git a/library/include/rocsparse-types.h b/library/include/rocsparse-types.h index 36070fcc..cc300752 100644 --- a/library/include/rocsparse-types.h +++ b/library/include/rocsparse-types.h @@ -28,15 +28,15 @@ typedef struct _rocsparse_hyb_mat *rocsparse_hyb_mat; extern "C" { #endif -/* ============================================================================================ */ +/* ==================================================================================== */ /*! parameter constants. */ /*! 
\brief Used to specify whether the matrix is to be transposed or not. */ typedef enum rocsparse_operation_ { - rocsparse_operation_none = 111, /**< Operate with the matrix. */ - rocsparse_operation_transpose = 112, /**< Operate with the transpose of the matrix. */ - rocsparse_operation_conjugate_transpose = 113 /**< Operate with the conjugate transpose of the matrix. */ + rocsparse_operation_none = 111, /**< Operate with matrix. */ + rocsparse_operation_transpose = 112, /**< Operate with transpose. */ + rocsparse_operation_conjugate_transpose = 113 /**< Operate with conj. transpose. */ } rocsparse_operation; /*! \brief Used to specify the matrix index base. */ @@ -60,7 +60,7 @@ typedef enum rocsparse_hyb_partition_ { rocsparse_hyb_partition_max = 2 } rocsparse_hyb_partition; -/* ============================================================================================ */ +/* ==================================================================================== */ /** * @brief rocsparse status codes definition */ @@ -70,7 +70,7 @@ typedef enum rocsparse_status_ { rocsparse_status_not_implemented = 2, /**< function is not implemented */ rocsparse_status_invalid_pointer = 3, /**< invalid pointer parameter */ rocsparse_status_invalid_size = 4, /**< invalid size parameter */ - rocsparse_status_memory_error = 5, /**< failed internal memory allocation, copy or dealloc */ + rocsparse_status_memory_error = 5, /**< failed memory allocation, copy, dealloc */ rocsparse_status_internal_error = 6, /**< other internal library failure */ rocsparse_status_invalid_value = 7, /**< invalid value parameter */ rocsparse_status_arch_mismatch = 8 /**< device arch is not supported */ diff --git a/library/include/rocsparse-version.h.in b/library/include/rocsparse-version.h.in index ebfc8065..f6e424e4 100644 --- a/library/include/rocsparse-version.h.in +++ b/library/include/rocsparse-version.h.in @@ -10,9 +10,11 @@ #ifndef _ROCSPARSE_VERSION_H_ #define _ROCSPARSE_VERSION_H_ +// clang-format 
off #define ROCSPARSE_VERSION_MAJOR @rocsparse_VERSION_MAJOR@ #define ROCSPARSE_VERSION_MINOR @rocsparse_VERSION_MINOR@ #define ROCSPARSE_VERSION_PATCH @rocsparse_VERSION_PATCH@ #define ROCSPARSE_VERSION_TWEAK @rocsparse_VERSION_TWEAK@ +// clang-format on #endif // _ROCSPARSE_VERSION_H_ diff --git a/library/src/conversion/csr2coo_device.h b/library/src/conversion/csr2coo_device.h index 06ff4651..7752061a 100644 --- a/library/src/conversion/csr2coo_device.h +++ b/library/src/conversion/csr2coo_device.h @@ -20,9 +20,9 @@ void csr2coo_kernel(rocsparse_int m, rocsparse_int vid = gid / THREADS; rocsparse_int nvec = hipGridDim_x * hipBlockDim_x / THREADS; - for(rocsparse_int ai=vid; ai(0); + ell_col_ind[idx] = -1; + ell_val[idx] = static_cast(0); } } diff --git a/library/src/conversion/rocsparse_csr2hyb.cpp b/library/src/conversion/rocsparse_csr2hyb.cpp index d97a98ab..710590fc 100644 --- a/library/src/conversion/rocsparse_csr2hyb.cpp +++ b/library/src/conversion/rocsparse_csr2hyb.cpp @@ -99,8 +99,8 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, hipStream_t stream = handle->stream; // Clear HYB structure if already allocated - hyb->m = m; - hyb->n = n; + hyb->m = m; + hyb->n = n; hyb->partition = partition_type; hyb->ell_nnz = 0; hyb->ell_width = 0; @@ -128,7 +128,7 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, } #define CSR2ELL_DIM 512 - //TODO we take max partition + // TODO we take max partition if (partition_type == rocsparse_hyb_partition_max) { // ELL part only, compute maximum non-zeros per row @@ -136,8 +136,8 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, // Allocate workspace rocsparse_int *workspace = NULL; - RETURN_IF_HIP_ERROR(hipMalloc((void**) &workspace, - sizeof(rocsparse_int)*blocks)); + RETURN_IF_HIP_ERROR( + hipMalloc((void**) &workspace, sizeof(rocsparse_int)*blocks)); hipLaunchKernelGGL((ell_width_kernel_part1), dim3(blocks), dim3(CSR2ELL_DIM), 0, stream, @@ -164,8 +164,8 @@ 
rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, hyb->ell_nnz = hyb->ell_width * m; // Allocate ELL part - RETURN_IF_HIP_ERROR(hipMalloc((void**) &hyb->ell_col_ind, - sizeof(rocsparse_int)*hyb->ell_nnz)); + RETURN_IF_HIP_ERROR( + hipMalloc((void**) &hyb->ell_col_ind, sizeof(rocsparse_int)*hyb->ell_nnz)); RETURN_IF_HIP_ERROR(hipMalloc(&hyb->ell_val, sizeof(T)*hyb->ell_nnz)); dim3 csr2ell_blocks((m-1)/CSR2ELL_DIM+1); diff --git a/library/src/handle.cpp b/library/src/handle.cpp index 70b4cc7e..db571067 100644 --- a/library/src/handle.cpp +++ b/library/src/handle.cpp @@ -28,7 +28,7 @@ _rocsparse_handle::_rocsparse_handle() } else { - layer_mode = (rocsparse_layer_mode) (atoi(str_layer_mode)); + layer_mode = (rocsparse_layer_mode)(atoi(str_layer_mode)); } // Open log file @@ -79,7 +79,7 @@ rocsparse_status _rocsparse_handle::set_stream(hipStream_t user_stream) /******************************************************************************* * get stream ******************************************************************************/ -rocsparse_status _rocsparse_handle::get_stream(hipStream_t* user_stream) const +rocsparse_status _rocsparse_handle::get_stream(hipStream_t *user_stream) const { *user_stream = stream; return rocsparse_status_success; diff --git a/library/src/include/definitions.h b/library/src/include/definitions.h index 6077d917..51c3824f 100644 --- a/library/src/include/definitions.h +++ b/library/src/include/definitions.h @@ -14,34 +14,35 @@ * thereby it can include top-level definitions included by all ******************************************************************************/ -#define RETURN_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ - { \ - hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if(TMP_STATUS_FOR_CHECK != hipSuccess) \ - { \ - return get_rocsparse_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ - } \ +#define RETURN_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ + { \ + hipError_t TMP_STATUS_FOR_CHECK = 
INPUT_STATUS_FOR_CHECK; \ + if (TMP_STATUS_FOR_CHECK != hipSuccess) \ + { \ + return get_rocsparse_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ + } \ } -#define THROW_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ - { \ - hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if(TMP_STATUS_FOR_CHECK != hipSuccess) \ - { \ - throw get_rocsparse_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ - } \ + +#define THROW_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ + { \ + hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if (TMP_STATUS_FOR_CHECK != hipSuccess) \ + { \ + throw get_rocsparse_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ + } \ } -#define PRINT_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ - { \ - hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if(TMP_STATUS_FOR_CHECK != hipSuccess) \ - { \ - fprintf(stderr, \ - "hip error code: %d at %s:%d\n", \ - TMP_STATUS_FOR_CHECK, \ - __FILE__, \ - __LINE__); \ - } \ +#define PRINT_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ + { \ + hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if (TMP_STATUS_FOR_CHECK != hipSuccess) \ + { \ + fprintf(stderr, \ + "hip error code: %d at %s:%d\n", \ + TMP_STATUS_FOR_CHECK, \ + __FILE__, \ + __LINE__); \ + } \ } #endif // DEFINITIONS_H diff --git a/library/src/include/handle.h b/library/src/include/handle.h index 4033631c..d3cda017 100644 --- a/library/src/include/handle.h +++ b/library/src/include/handle.h @@ -83,13 +83,16 @@ struct _rocsparse_hyb_mat rocsparse_int m = 0; // num cols rocsparse_int n = 0; + // partition type rocsparse_hyb_partition partition = rocsparse_hyb_partition_auto; + // ELL matrix part rocsparse_int ell_nnz = 0; rocsparse_int ell_width = 0; rocsparse_int *ell_col_ind = nullptr; void *ell_val = nullptr; + // COO matrix part rocsparse_int coo_nnz = 0; rocsparse_int *coo_row_ind = nullptr; diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index caae10e8..3de0b09a 100644 --- 
a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -8,6 +8,10 @@ #include +#ifdef __cplusplus +extern "C" { +#endif + /******************************************************************************** * \brief rocsparse_handle is a structure holding the rocsparse library context. * It must be initialized using rocsparse_create_handle() @@ -15,7 +19,6 @@ * to all subsequent library function calls. * It should be destroyed at the end using rocsparse_destroy_handle(). *******************************************************************************/ -extern "C" rocsparse_status rocsparse_create_handle(rocsparse_handle *handle) { // Check if handle is valid @@ -30,9 +33,8 @@ rocsparse_status rocsparse_create_handle(rocsparse_handle *handle) { *handle = new _rocsparse_handle(); log_trace(*handle, "rocsparse_create_handle"); - } - catch(const rocsparse_status &status) + catch (const rocsparse_status &status) { return status; } @@ -43,7 +45,6 @@ rocsparse_status rocsparse_create_handle(rocsparse_handle *handle) /******************************************************************************** * \brief destroy handle *******************************************************************************/ -extern "C" rocsparse_status rocsparse_destroy_handle(rocsparse_handle handle) { log_trace(handle, "rocsparse_destroy_handle"); @@ -52,7 +53,7 @@ rocsparse_status rocsparse_destroy_handle(rocsparse_handle handle) { delete handle; } - catch(const rocsparse_status &status) + catch (const rocsparse_status &status) { return status; } @@ -63,7 +64,6 @@ rocsparse_status rocsparse_destroy_handle(rocsparse_handle handle) * \brief Indicates whether the scalar value pointers are on the host or device. 
* Set pointer mode, can be host or device *******************************************************************************/ -extern "C" rocsparse_status rocsparse_set_pointer_mode(rocsparse_handle handle, rocsparse_pointer_mode mode) { @@ -80,7 +80,6 @@ rocsparse_status rocsparse_set_pointer_mode(rocsparse_handle handle, /******************************************************************************** * \brief Get pointer mode, can be host or device. *******************************************************************************/ -extern "C" rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, rocsparse_pointer_mode *mode) { @@ -98,9 +97,7 @@ rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, *! \brief Set rocsparse stream used for all subsequent library function calls. * If not set, all hip kernels will take the default NULL stream. *******************************************************************************/ -extern "C" -rocsparse_status rocsparse_set_stream(rocsparse_handle handle, - hipStream_t stream_id) +rocsparse_status rocsparse_set_stream(rocsparse_handle handle, hipStream_t stream_id) { // Check if handle is valid if (handle == nullptr) @@ -114,9 +111,7 @@ rocsparse_status rocsparse_set_stream(rocsparse_handle handle, /******************************************************************************** *! \brief Get rocsparse stream used for all subsequent library function calls. 
*******************************************************************************/ -extern "C" -rocsparse_status rocsparse_get_stream(rocsparse_handle handle, - hipStream_t *stream_id) +rocsparse_status rocsparse_get_stream(rocsparse_handle handle, hipStream_t *stream_id) { // Check if handle is valid if (handle == nullptr) @@ -133,7 +128,6 @@ rocsparse_status rocsparse_get_stream(rocsparse_handle handle, * version / 100 % 1000 = minor version * version / 100000 = major version *******************************************************************************/ -extern "C" rocsparse_status rocsparse_get_version(rocsparse_handle handle, int *version) { // Check if handle is valid @@ -155,7 +149,6 @@ rocsparse_status rocsparse_get_version(rocsparse_handle handle, int *version) * calls that involve the matrix. * It should be destroyed at the end using rocsparse_destroy_mat_descr(). *******************************************************************************/ -extern "C" rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descr) { if (descr == nullptr) @@ -169,7 +162,7 @@ rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descr) { *descr = new _rocsparse_mat_descr; } - catch(const rocsparse_status &status) + catch (const rocsparse_status &status) { return status; } @@ -180,7 +173,6 @@ rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descr) /******************************************************************************** * \brief destroy matrix descriptor *******************************************************************************/ -extern "C" rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descr) { // Destruct @@ -188,7 +180,7 @@ rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descr) { delete descr; } - catch(const rocsparse_status &status) + catch (const rocsparse_status &status) { return status; } @@ -198,7 +190,6 @@ rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descr) 
/******************************************************************************** * \brief Set the index base of the matrix descriptor. *******************************************************************************/ -extern "C" rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descr, rocsparse_index_base base) { @@ -207,8 +198,7 @@ rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descr, { return rocsparse_status_invalid_pointer; } - if (base != rocsparse_index_base_zero && - base != rocsparse_index_base_one) + if (base != rocsparse_index_base_zero && base != rocsparse_index_base_one) { return rocsparse_status_invalid_value; } @@ -219,7 +209,6 @@ rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descr, /******************************************************************************** * \brief Returns the index base of the matrix descriptor. *******************************************************************************/ -extern "C" rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr descr) { // If descriptor is invalid, default index base is returned @@ -233,7 +222,6 @@ rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr desc /******************************************************************************** * \brief Set the matrix type of the matrix descriptor. *******************************************************************************/ -extern "C" rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descr, rocsparse_matrix_type type) { @@ -255,7 +243,6 @@ rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descr, /******************************************************************************** * \brief Returns the matrix type of the matrix descriptor. 
*******************************************************************************/ -extern "C" rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr) { // If descriptor is invalid, default matrix type is returned @@ -273,7 +260,6 @@ rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr) * calls that involve the HYB matrix. * It should be destroyed at the end using rocsparse_destroy_hyb_mat(). *******************************************************************************/ -extern "C" rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat *hyb) { if (hyb == nullptr) @@ -287,7 +273,7 @@ rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat *hyb) { *hyb = new _rocsparse_hyb_mat; } - catch(const rocsparse_status &status) + catch (const rocsparse_status &status) { return status; } @@ -298,7 +284,6 @@ rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat *hyb) /******************************************************************************** * \brief Destroy HYB matrix. 
*******************************************************************************/ -extern "C" rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb) { // Destruct @@ -306,9 +291,13 @@ rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb) { delete hyb; } - catch(const rocsparse_status &status) + catch (const rocsparse_status &status) { return status; } return rocsparse_status_success; } + +#ifdef __cplusplus +} +#endif diff --git a/library/src/status.cpp b/library/src/status.cpp index a4bf803f..5b447f4f 100644 --- a/library/src/status.cpp +++ b/library/src/status.cpp @@ -10,11 +10,12 @@ /******************************************************************************* * \brief convert hipError_t to rocsparse_status - * TODO - enumerate library calls to hip runtime, enumerate possible errors from those calls + * TODO - enumerate library calls to hip runtime, enumerate possible errors from + * those calls ******************************************************************************/ rocsparse_status get_rocsparse_status_for_hip_status(hipError_t status) { - switch(status) + switch (status) { // success case hipSuccess: From 5d9d815e8d077d50574463bbcc8d64821c75785b Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 14 May 2018 12:40:10 +0200 Subject: [PATCH 060/304] Jenkinsfile update --- Jenkinsfile | 299 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 228 insertions(+), 71 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 477f5f09..68329e6b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,6 +1,8 @@ #!/usr/bin/env groovy -// Generated from snippet generator 'properties; set job properties' +//////////////////////////////////////////////////////////////////////// +// Mostly generated from snippet generator 'properties; set job properties' +// Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM properties([ pipelineTriggers([cron('0 3 * * *'), [$class: 'PeriodicFolderTrigger', interval: 
'5m']]), buildDiscarder(logRotator( @@ -18,6 +20,32 @@ properties([ // import hudson.FilePath; import java.nio.file.Path; +//////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////// +// Return build number of upstream job +@NonCPS +int get_upstream_build_num( ) +{ + def upstream_cause = currentBuild.rawBuild.getCause( hudson.model.Cause$UpstreamCause ) + if( upstream_cause == null) + return 0 + + return upstream_cause.getUpstreamBuild() +} + +//////////////////////////////////////////////////////////////////////// +// Return project name of upstream job +@NonCPS +String get_upstream_build_project( ) +{ + def upstream_cause = currentBuild.rawBuild.getCause( hudson.model.Cause$UpstreamCause ) + if( upstream_cause == null) + return null + + return upstream_cause.getUpstreamProject() +} + //////////////////////////////////////////////////////////////////////// // Calculate the relative path between two sub-directories from a common root @NonCPS @@ -34,16 +62,15 @@ String g_relativize( String root_string, String rel_source, String rel_build ) // Construct the relative path of the build directory void build_directory_rel( project_paths paths, compiler_data hcc_args ) { - // if( hcc_args.build_config.equalsIgnoreCase( 'release' ) ) - // { - // paths.project_build_prefix = paths.build_prefix + '/' + paths.project_name + '/release'; - // } - // else - // { - // paths.project_build_prefix = paths.build_prefix + '/' + paths.project_name + '/debug'; - // } +// if( hcc_args.build_config.equalsIgnoreCase( 'release' ) ) +// { +// paths.project_build_prefix = paths.build_prefix + '/' + paths.project_name + '/release'; +// } +// else +// { +// paths.project_build_prefix = paths.build_prefix + '/' + paths.project_name + '/debug'; +// } paths.project_build_prefix = paths.build_prefix + '/' + paths.project_name; - } //////////////////////////////////////////////////////////////////////// @@ 
-100,7 +127,7 @@ void checkout_and_version( project_paths paths ) // The docker images contains all dependencies, including OS platform, to build def docker_build_image( docker_data docker_args, project_paths paths ) { - String build_image_name = "build" + String build_image_name = "build-rocsparse-hip-artifactory" def build_image = null dir( paths.project_src_prefix ) @@ -122,7 +149,7 @@ def docker_build_image( docker_data docker_args, project_paths paths ) //////////////////////////////////////////////////////////////////////// // This encapsulates the cmake configure, build and package commands // Leverages docker containers to encapsulate the build in a fixed environment -Boolean docker_build_inside_image( def build_image, compiler_data compiler_args, docker_data docker_args, project_paths paths ) +def docker_build_inside_image( def build_image, compiler_data compiler_args, docker_data docker_args, project_paths paths ) { // Construct a relative path from build directory to src directory; used to invoke cmake String rel_path_to_src = g_relativize( pwd( ), paths.project_src_prefix, paths.project_build_prefix ) @@ -137,13 +164,6 @@ Boolean docker_build_inside_image( def build_image, compiler_data compiler_args, build_type_postfix = "-d" } - // For the nvidia path, we somewhat arbitrarily choose to use the hcc-ctu rocsparse package - String rocsparse_archive_path=compiler_args.compiler_name; - if( rocsparse_archive_path.toLowerCase( ).startsWith( 'nvcc-' ) ) - { - rocsparse_archive_path='hcc-ctu' - } - build_image.inside( docker_args.docker_run_args ) { withEnv(["CXX=${compiler_args.compiler_path}", 'CLICOLOR_FORCE=1']) @@ -159,12 +179,12 @@ Boolean docker_build_inside_image( def build_image, compiler_data compiler_args, stage( "Test ${compiler_args.compiler_name} ${compiler_args.build_config}" ) { // Cap the maximum amount of testing to be a few hours; assume failure if the time limit is hit - timeout(time: 1, unit: 'HOURS') + timeout(time: 2, unit: 'HOURS') { sh 
"""#!/usr/bin/env bash set -x cd ${paths.project_build_prefix}/build/release/clients/tests - ./rocsparse-test${build_type_postfix} --gtest_output=xml --gtest_color=yes + LD_LIBRARY_PATH=/opt/rocm/hcc/lib ./rocsparse-test${build_type_postfix} --gtest_output=xml --gtest_color=yes """ junit "${paths.project_build_prefix}/build/release/clients/tests/*.xml" } @@ -178,16 +198,15 @@ Boolean docker_build_inside_image( def build_image, compiler_data compiler_args, make package """ - sh """#!/usr/bin/env bash - set -x - rm -rf ${docker_context} && mkdir -p ${docker_context} - mv ${paths.project_build_prefix}/build/release/*.deb ${docker_context} - # mv ${paths.project_build_prefix}/build/release/*.rpm ${docker_context} - dpkg -c ${docker_context}/*.deb - """ - - archiveArtifacts artifacts: "${docker_context}/*.deb", fingerprint: true - // archiveArtifacts artifacts: "${docker_context}/*.rpm", fingerprint: true + if( paths.project_name.equalsIgnoreCase( 'rocsparse-ubuntu' ) ) + { + sh """#!/usr/bin/env bash + set -x + rm -rf ${docker_context} && mkdir -p ${docker_context} + mv ${paths.project_build_prefix}/build/release/*.deb ${docker_context} + dpkg -c ${docker_context}/*.deb + """ + archiveArtifacts artifacts: "${docker_context}/*.deb", fingerprint: true // stage('Clang Format') // { @@ -202,12 +221,23 @@ Boolean docker_build_inside_image( def build_image, compiler_data compiler_args, // | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-3.8 -style=file {} | diff - {}\' // ''' // } - + } + else if( paths.project_name.equalsIgnoreCase( 'rocsparse-fedora' ) ) + { + sh """#!/usr/bin/env bash + set -x + rm -rf ${docker_context} && mkdir -p ${docker_context} + mv ${paths.project_build_prefix}/build/release/*.deb ${docker_context} + rpm -qlp ${docker_context}/*.rpm + """ + archiveArtifacts artifacts: "${docker_context}/*.rpm", fingerprint: true + } } + } } - return true + return void } //////////////////////////////////////////////////////////////////////// @@ -220,7 +250,7 @@ String 
docker_test_install( compiler_data compiler_args, docker_data docker_args String image_name = "rocsparse-hip-${compiler_args.compiler_name}-ubuntu-16.04" String docker_context = "${compiler_args.build_config}/${compiler_args.compiler_name}" - stage( "Artifactory ${compiler_args.compiler_name} ${compiler_args.build_config}" ) + stage( "Install ${compiler_args.compiler_name} ${compiler_args.build_config}" ) { // We copy the docker files into the bin directory where the .deb lives so that it's a clean build everytime sh """#!/usr/bin/env bash @@ -242,6 +272,62 @@ String docker_test_install( compiler_data compiler_args, docker_data docker_args return image_name } +//////////////////////////////////////////////////////////////////////// +// hip_integration_testing +// This function sets up compilation and testing of HiP on a compiler downloaded from an upstream build +// Integration testing is centered around docker and constructing clean test environments every time + +// NOTES: I have implemeneted integration testing 3 different ways, and I've come to the conclusion nothing is perfect +// 1. I've tried having HCC push the test compiler to artifactory, and having HiP download the test docker image from artifactory +// a. The act of uploading and downloading images from artifactory takes minutes +// b. There is no good way of deleting images from a repository. You have to use an arcane CURL command and I don't know how +// to keep the password secret. These test integration images are meant to be ephemeral. +// 2. I tried 'docker save' to export a docker image into a tarball, and transfering the image through 'copy artifacts plugin' +// a. The HCC docker image uncompressed is over 1GB +// b. Compressing the docker image takes even longer than uploading the image to artifactory +// 3. Download the HCC .deb and dockerfile through 'copy artifacts plugin'. Create a new HCC image on the fly +// a. 
There is inefficency in building a new ubuntu image and installing HCC twice (once in HCC build, once here) +// b. This solution doesn't scale when we start testing downstream libraries + +// I've implemented solution #3 above, probably transitioning to #2 down the line (probably without compression) +String hip_integration_testing( String inside_args, String job, String build_config ) +{ + // Attempt to make unique docker image names for each build, to support concurrent builds + // Mangle docker org name with upstream build info + String testing_org_name = 'hip-test-' + get_upstream_build_project( ).replaceAll('/','-') + '-' + get_upstream_build_num( ) + + // Tag image name with this build number + String hip_test_image_name = "hip:${env.BUILD_NUMBER}" + + def rocsparse_integration_image = null + + dir( 'integration-testing' ) + { + deleteDir( ) + + // This invokes 'copy artifact plugin' to copy archived files from upstream build + step([$class: 'CopyArtifact', filter: 'archive/**/*.deb, docker/dockerfile-*', + fingerprintArtifacts: true, projectName: get_upstream_build_project( ), flatten: true, + selector: [$class: 'TriggeredBuildSelector', allowUpstreamDependencies: false, fallbackToLastSuccessful: false, upstreamFilterStrategy: 'UseGlobalSetting'], + target: '.' ]) + + docker.build( "${testing_org_name}/${hip_test_image_name}", "-f dockerfile-hip-ubuntu-16.04 ." 
) + } + + // Checkout source code, dependencies and version files + String rocsparse_src_rel = checkout_and_version( job ) + + // Conctruct a binary directory path based on build config + String rocsparse_bin_rel = build_directory_rel( build_config ) + + // Build rocsparse inside of the build environment + rocsparse_integration_image = docker_build_image( job, testing_org_name, '', rocsparse_src_rel, "${testing_org_name}/${hip_test_image_name}" ) + + docker_build_inside_image( rocsparse_integration_image, inside_args, job, '', build_config, rocsparse_src_rel, rocsparse_bin_rel ) + + docker_clean_images( testing_org_name, '*' ) +} + // Docker related variables gathered together to reduce parameter bloat on function calls class docker_data implements Serializable { @@ -271,14 +357,14 @@ class project_paths implements Serializable String build_command } +//////////////////////////////////////////////////////////////////////// +// -- MAIN +// Following this line is the start of MAIN of this Jenkinsfile + // This defines a common build pipeline used by most targets def build_pipeline( compiler_data compiler_args, docker_data docker_args, project_paths rocsparse_paths, def docker_inside_closure ) { ansiColor( 'vga' ) - { - // NOTE: build_succeeded does not appear to be local to each function invokation. I couldn't use it where each - // node had a different success value. 
- def build_succeeded = false; stage( "Build ${compiler_args.compiler_name} ${compiler_args.build_config}" ) { @@ -295,13 +381,12 @@ def build_pipeline( compiler_data compiler_args, docker_data docker_args, projec rocsparse_build_image.inside( docker_args.docker_run_args, docker_inside_closure ) // Build rocsparse inside of the build environment - build_succeeded = docker_build_inside_image( rocsparse_build_image, compiler_args, docker_args, rocsparse_paths ) + docker_build_inside_image( rocsparse_build_image, compiler_args, docker_args, rocsparse_paths ) } - // After a successful build, test the installer - // Only do this for rocm based builds - if( compiler_args.compiler_name.toLowerCase( ).startsWith( 'hcc-' ) ) + if( !rocsparse_paths.project_name.equalsIgnoreCase( 'rocsparse-hcc-ctu' ) ) { + // After a successful build, upload a docker image of the results String job_name = env.JOB_NAME.toLowerCase( ) String rocsparse_image_name = docker_test_install( compiler_args, docker_args, rocsparse_paths, job_name ) @@ -313,38 +398,45 @@ def build_pipeline( compiler_data compiler_args, docker_data docker_args, projec // The following launches 3 builds in parallel: hcc-ctu, hcc-1.6 and cuda parallel hcc_ctu: { - node( 'docker && rocm && dkms' ) + try { - def docker_args = new docker_data( - from_image:'compute-artifactory:5001/rocm-developer-tools/hip/master/hip-hcc-ctu-ubuntu-16.04:latest', - build_docker_file:'dockerfile-build-ubuntu-16.04', - install_docker_file:'dockerfile-install-ubuntu-16.04', - docker_run_args:'--device=/dev/kfd --device=/dev/dri --group-add=video', - docker_build_args:' --pull' ) - - def compiler_args = new compiler_data( - compiler_name:'hcc-ctu', - build_config:'Release', - compiler_path:'/opt/rocm/bin/hcc' ) - - def rocsparse_paths = new project_paths( - project_name:'rocsparse-hcc-ctu', - src_prefix:'src', - build_prefix:'src', - build_command: './install.sh -cd' ) + node( 'docker && rocm && dkms') + { + def docker_args = new docker_data( + 
from_image:'compute-artifactory:5001/rocm-developer-tools/hip/master/hip-hcc-ctu-ubuntu-16.04:latest', + build_docker_file:'dockerfile-build-ubuntu-16.04', + install_docker_file:'dockerfile-install-ubuntu-16.04', + docker_run_args:'--device=/dev/kfd --device=/dev/dri --group-add=video', + docker_build_args:' --pull' ) + + def compiler_args = new compiler_data( + compiler_name:'hcc-ctu', + build_config:'Release', + compiler_path:'/opt/rocm/bin/hcc' ) + + def rocsparse_paths = new project_paths( + project_name:'rocsparse-hcc-ctu', + src_prefix:'src', + build_prefix:'src', + build_command: './install.sh -cd' ) + + def print_version_closure = { + sh """ + set -x + /opt/rocm/bin/rocm_agent_enumerator -t ALL + /opt/rocm/bin/hcc --version + """ + } - def print_version_closure = { - sh """ - set -x - /opt/rocm/bin/rocm_agent_enumerator -t ALL - /opt/rocm/bin/hcc --version - """ + build_pipeline( compiler_args, docker_args, rocsparse_paths, print_version_closure ) } - - build_pipeline( compiler_args, docker_args, rocsparse_paths, print_version_closure ) + } + catch( err ) + { + currentBuild.result = 'UNSTABLE' } }, -hcc_rocm: +rocm_ubuntu: { node( 'docker && rocm && dkms' ) { @@ -356,12 +448,12 @@ hcc_rocm: docker_build_args:' --pull' ) def hcc_compiler_args = new compiler_data( - compiler_name:'hcc-rocm', + compiler_name:'hcc-rocm-ubuntu-16.04', build_config:'Release', compiler_path:'/opt/rocm/bin/hcc' ) def rocsparse_paths = new project_paths( - project_name:'rocsparse-hcc-rocm', + project_name:'rocsparse-hcc-rocm-ubuntu-16.04', src_prefix:'src', build_prefix:'src', build_command: './install.sh -cd' ) @@ -377,3 +469,68 @@ hcc_rocm: build_pipeline( hcc_compiler_args, hcc_docker_args, rocsparse_paths, print_version_closure ) } } +//, +// rocm_fedora: +// { +// node( 'docker && rocm && dkms') +// { +// def hcc_docker_args = new docker_data( +// from_image:'rocm/dev-fedora-24:latest', +// build_docker_file:'dockerfile-build-fedora', +// 
install_docker_file:'dockerfile-install-fedora', +// docker_run_args:'--device=/dev/kfd', +// docker_build_args:' --pull' ) + +// def hcc_compiler_args = new compiler_data( +// compiler_name:'hcc-rocm-fedora', +// build_config:'Release', +// compiler_path:'/opt/rocm/bin/hcc' ) + +// def rocsparse_paths = new project_paths( +// project_name:'rocsparse-fedora', +// src_prefix:'src', +// build_prefix:'src', +// build_command: './install.sh -c' ) + +// def print_version_closure = { +// sh """ +// set -x +// /opt/rocm/bin/hcc --version +// """ +// } + +// build_pipeline( hcc_compiler_args, hcc_docker_args, rocsparse_paths, print_version_closure ) +// } +// }, +// nvcc: +// { +// node( 'docker && cuda' ) +// { +// def hcc_docker_args = new docker_data( +// from_image:'nvidia/cuda:9.0-devel', +// build_docker_file:'dockerfile-build-nvidia-cuda', +// install_docker_file:'dockerfile-install-nvidia-cuda', +// docker_run_args:'--runtime=nvidia', +// docker_build_args:' --pull' ) + +// def hcc_compiler_args = new compiler_data( +// compiler_name:'nvcc-9.0', +// build_config:'Release', +// compiler_path:'/opt/rocm/bin/hipcc' ) + +// def rocsparse_paths = new project_paths( +// project_name:'rocsparse-cuda', +// src_prefix:'src', +// build_prefix:'build' ) + +// def print_version_closure = { +// sh """ +// set -x +// nvidia-smi +// nvcc --version +// """ +// } + +// build_pipeline( hcc_compiler_args, hcc_docker_args, rocsparse_paths, print_version_closure ) +// } +// } From 8118b03ab1d6db5f1fa304f620a8a7587459c1b4 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 14 May 2018 13:24:58 +0200 Subject: [PATCH 061/304] Jenkinsfile fixed bracket --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 68329e6b..4745b64d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -392,7 +392,7 @@ def build_pipeline( compiler_data compiler_args, docker_data docker_args, projec docker_clean_images( job_name, rocsparse_image_name ) } - } + } 
// The following launches 3 builds in parallel: hcc-ctu, hcc-1.6 and cuda From 9d5461cc62e9d7bc51603913916426b6d0ce291f Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 14 May 2018 13:56:38 +0200 Subject: [PATCH 062/304] bracketfix jenkinsfile --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4745b64d..c973a150 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -365,7 +365,7 @@ class project_paths implements Serializable def build_pipeline( compiler_data compiler_args, docker_data docker_args, project_paths rocsparse_paths, def docker_inside_closure ) { ansiColor( 'vga' ) - + { stage( "Build ${compiler_args.compiler_name} ${compiler_args.build_config}" ) { // Checkout source code, dependencies and version files @@ -392,7 +392,7 @@ def build_pipeline( compiler_data compiler_args, docker_data docker_args, projec docker_clean_images( job_name, rocsparse_image_name ) } - + } } // The following launches 3 builds in parallel: hcc-ctu, hcc-1.6 and cuda From 3cd8d50cc5a624c7619997a934d1896a556c3e1b Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 14 May 2018 14:12:08 +0200 Subject: [PATCH 063/304] Jenkinsfile update --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index c973a150..6184fd60 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -247,7 +247,7 @@ def docker_build_inside_image( def build_image, compiler_data compiler_args, doc String docker_test_install( compiler_data compiler_args, docker_data docker_args, project_paths rocsparse_paths, String job_name ) { def rocsparse_install_image = null - String image_name = "rocsparse-hip-${compiler_args.compiler_name}-ubuntu-16.04" + String image_name = "rocsparse-hip-${compiler_args.compiler_name}" String docker_context = "${compiler_args.build_config}/${compiler_args.compiler_name}" stage( "Install ${compiler_args.compiler_name} ${compiler_args.build_config}" ) From 
03fb4048b0656a46637e2a4fd51b9c9fdbfd1b5c Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 14 May 2018 22:28:44 +0200 Subject: [PATCH 064/304] Jenkins update --- Jenkinsfile | 20 +++++++++---------- ...d-ubuntu-16.04 => dockerfile-build-ubuntu} | 0 ...ubuntu-16.04 => dockerfile-install-ubuntu} | 0 3 files changed, 10 insertions(+), 10 deletions(-) rename docker/{dockerfile-build-ubuntu-16.04 => dockerfile-build-ubuntu} (100%) rename docker/{dockerfile-install-ubuntu-16.04 => dockerfile-install-ubuntu} (100%) diff --git a/Jenkinsfile b/Jenkinsfile index 6184fd60..4e2bd26e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -227,7 +227,7 @@ def docker_build_inside_image( def build_image, compiler_data compiler_args, doc sh """#!/usr/bin/env bash set -x rm -rf ${docker_context} && mkdir -p ${docker_context} - mv ${paths.project_build_prefix}/build/release/*.deb ${docker_context} + mv ${paths.project_build_prefix}/build/release/*.rpm ${docker_context} rpm -qlp ${docker_context}/*.rpm """ archiveArtifacts artifacts: "${docker_context}/*.rpm", fingerprint: true @@ -400,12 +400,12 @@ parallel hcc_ctu: { try { - node( 'docker && rocm && dkms') + node( 'docker && rocm && gfx900') { def docker_args = new docker_data( from_image:'compute-artifactory:5001/rocm-developer-tools/hip/master/hip-hcc-ctu-ubuntu-16.04:latest', - build_docker_file:'dockerfile-build-ubuntu-16.04', - install_docker_file:'dockerfile-install-ubuntu-16.04', + build_docker_file:'dockerfile-build-ubuntu', + install_docker_file:'dockerfile-install-ubuntu', docker_run_args:'--device=/dev/kfd --device=/dev/dri --group-add=video', docker_build_args:' --pull' ) @@ -438,22 +438,22 @@ parallel hcc_ctu: }, rocm_ubuntu: { - node( 'docker && rocm && dkms' ) + node( 'docker && rocm && gfx900') { def hcc_docker_args = new docker_data( from_image:'rocm/dev-ubuntu-16.04:1.7.1', - build_docker_file:'dockerfile-build-ubuntu-16.04', - install_docker_file:'dockerfile-install-ubuntu-16.04', + 
build_docker_file:'dockerfile-build-ubuntu', + install_docker_file:'dockerfile-install-ubuntu', docker_run_args:'--device=/dev/kfd --device=/dev/dri --group-add=video', docker_build_args:' --pull' ) def hcc_compiler_args = new compiler_data( - compiler_name:'hcc-rocm-ubuntu-16.04', + compiler_name:'hcc-rocm-ubuntu', build_config:'Release', compiler_path:'/opt/rocm/bin/hcc' ) def rocsparse_paths = new project_paths( - project_name:'rocsparse-hcc-rocm-ubuntu-16.04', + project_name:'rocsparse-ubuntu', src_prefix:'src', build_prefix:'src', build_command: './install.sh -cd' ) @@ -472,7 +472,7 @@ rocm_ubuntu: //, // rocm_fedora: // { -// node( 'docker && rocm && dkms') +// node( 'docker && rocm && gfx900') // { // def hcc_docker_args = new docker_data( // from_image:'rocm/dev-fedora-24:latest', diff --git a/docker/dockerfile-build-ubuntu-16.04 b/docker/dockerfile-build-ubuntu similarity index 100% rename from docker/dockerfile-build-ubuntu-16.04 rename to docker/dockerfile-build-ubuntu diff --git a/docker/dockerfile-install-ubuntu-16.04 b/docker/dockerfile-install-ubuntu similarity index 100% rename from docker/dockerfile-install-ubuntu-16.04 rename to docker/dockerfile-install-ubuntu From 1f012347817c6af3682a25966e836f347441e792 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 15 May 2018 10:37:24 +0200 Subject: [PATCH 065/304] clang-format converter and clang check for jenkins --- .clang-format | 90 +++ Jenkinsfile | 26 +- clients/benchmarks/client.cpp | 28 +- clients/common/arg_check.cpp | 6 +- .../rocsparse_template_specialization.cpp | 84 ++- clients/common/unit.cpp | 3 +- clients/common/utility.cpp | 11 +- clients/include/arg_check.hpp | 9 +- clients/include/rocsparse.hpp | 33 +- clients/include/rocsparse_test_unique_ptr.hpp | 4 +- clients/include/testing_axpyi.hpp | 157 +++-- clients/include/testing_coo2csr.hpp | 115 ++-- clients/include/testing_csr2coo.hpp | 109 ++-- clients/include/testing_csrmv.hpp | 331 +++++------ clients/include/utility.hpp | 288 
++++----- clients/samples/example_csrmv.cpp | 113 ++-- clients/samples/example_ellmv.cpp | 90 +-- clients/samples/example_handle.cpp | 7 +- clients/tests/rocsparse_gtest_main.cpp | 2 +- clients/tests/test_axpyi.cpp | 17 +- clients/tests/test_coo2csr.cpp | 13 +- clients/tests/test_csr2coo.cpp | 13 +- clients/tests/test_csrmv.cpp | 14 +- library/include/rocsparse-auxiliary.h | 18 +- library/include/rocsparse-functions.h | 152 ++--- library/include/rocsparse-types.h | 6 +- library/src/conversion/coo2csr_device.h | 30 +- library/src/conversion/csr2coo_device.h | 15 +- library/src/conversion/csr2hyb_device.h | 57 +- library/src/conversion/rocsparse_coo2csr.cpp | 42 +- library/src/conversion/rocsparse_csr2coo.cpp | 163 ++++-- library/src/conversion/rocsparse_csr2hyb.cpp | 170 +++--- library/src/handle.cpp | 14 +- library/src/include/definitions.h | 6 +- library/src/include/handle.h | 28 +- library/src/include/logging.h | 4 +- library/src/include/utility.h | 4 +- library/src/level1/rocsparse_axpyi.cpp | 144 ++--- library/src/level2/csrmv_device.h | 103 ++-- library/src/level2/ellmv_device.h | 27 +- library/src/level2/rocsparse_csrmv.cpp | 547 ++++++++++++------ library/src/level2/rocsparse_hybmv.cpp | 181 +++--- library/src/rocsparse_auxiliary.cpp | 70 +-- library/src/status.cpp | 54 +- 44 files changed, 1874 insertions(+), 1524 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..22f26749 --- /dev/null +++ b/.clang-format @@ -0,0 +1,90 @@ +--- +Language: Cpp +AccessModifierOffset: 0 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: true +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: 
false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: false + AfterObjCDeclaration: true + AfterStruct: true + AfterUnion: true + BeforeCatch: true + BeforeElse: true + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + - Regex: '^(<|"(gtest|isl|json)/)' + Priority: 3 + - Regex: '.*' + Priority: 1 +IndentCaseLabels: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Left +ReflowComments: true +SortIncludes: false +SpaceAfterCStyleCast: false +# SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: Never +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false 
+SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseTab: Never +... + diff --git a/Jenkinsfile b/Jenkinsfile index 6184fd60..50f1a23b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -208,19 +208,19 @@ def docker_build_inside_image( def build_image, compiler_data compiler_args, doc """ archiveArtifacts artifacts: "${docker_context}/*.deb", fingerprint: true -// stage('Clang Format') -// { -// sh ''' -// find . -iname \'*.h\' \ -// -o -iname \'*.hpp\' \ -// -o -iname \'*.cpp\' \ -// -o -iname \'*.h.in\' \ -// -o -iname \'*.hpp.in\' \ -// -o -iname \'*.cpp.in\' \ -// | grep -v 'build/' \ -// | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-3.8 -style=file {} | diff - {}\' -// ''' -// } + stage('Clang Format') + { + sh ''' + find . -iname \'*.h\' \ + -o -iname \'*.hpp\' \ + -o -iname \'*.cpp\' \ + -o -iname \'*.h.in\' \ + -o -iname \'*.hpp.in\' \ + -o -iname \'*.cpp.in\' \ + | grep -v 'build/' \ + | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-3.8 -style=file {} | diff - {}\' + ''' + } } else if( paths.project_name.equalsIgnoreCase( 'rocsparse-fedora' ) ) { diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 8b790b07..cf1ffcdc 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -16,11 +16,11 @@ namespace po = boost::program_options; -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { Arguments argus; argus.unit_check = 0; - argus.timing = 1; + argus.timing = 1; std::string function; char precision = 's'; @@ -29,6 +29,7 @@ int main(int argc, char *argv[]) po::options_description desc("rocsparse client command line options"); desc.add_options()("help,h", "produces this help message") + // clang-format off ("sizem,m", po::value(&argus.M)->default_value(128), "Specific matrix size testing: sizem is only applicable to SPARSE-2 " @@ -77,18 +78,19 @@ int main(int argc, char *argv[]) ("device", po::value(&device_id)->default_value(0), "Set default device to be used for subsequent program 
runs"); + // clang-format on po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); po::notify(vm); - if (vm.count("help")) + if(vm.count("help")) { std::cout << desc << std::endl; return 0; } - if (precision != 's' && precision != 'd') + if(precision != 's' && precision != 'd') { fprintf(stderr, "Invalid value for --precision\n"); return -1; @@ -109,31 +111,31 @@ int main(int argc, char *argv[]) /* ============================================================================================ */ - if (argus.M < 0 || argus.N < 0) + if(argus.M < 0 || argus.N < 0) { fprintf(stderr, "Invalid dimension\n"); return -1; } - if (function == "axpyi") + if(function == "axpyi") { - if (precision == 's') + if(precision == 's') testing_axpyi(argus); - else if (precision == 'd') + else if(precision == 'd') testing_axpyi(argus); } - else if (function == "csrmv") + else if(function == "csrmv") { - if (precision == 's') + if(precision == 's') testing_csrmv(argus); - else if (precision == 'd') + else if(precision == 'd') testing_csrmv(argus); } - else if (function == "csr2coo") + else if(function == "csr2coo") { testing_csr2coo(argus); } - else if (function == "coo2csr") + else if(function == "coo2csr") { testing_coo2csr(argus); } diff --git a/clients/common/arg_check.cpp b/clients/common/arg_check.cpp index c12931ef..fecdcd57 100644 --- a/clients/common/arg_check.cpp +++ b/clients/common/arg_check.cpp @@ -25,8 +25,7 @@ } \ } -void verify_rocsparse_status_invalid_pointer(rocsparse_status status, - const char* message) +void verify_rocsparse_status_invalid_pointer(rocsparse_status status, const char* message) { #ifdef GOOGLE_TEST ASSERT_EQ(status, rocsparse_status_invalid_pointer); @@ -39,8 +38,7 @@ void verify_rocsparse_status_invalid_pointer(rocsparse_status status, #endif } -void verify_rocsparse_status_invalid_size(rocsparse_status status, - const char* message) +void verify_rocsparse_status_invalid_size(rocsparse_status status, const char* message) { #ifdef 
GOOGLE_TEST ASSERT_EQ(status, rocsparse_status_invalid_size); diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 36ea3646..9ea625c0 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -11,10 +11,10 @@ namespace rocsparse { template <> rocsparse_status rocsparse_axpyi(rocsparse_handle handle, rocsparse_int nnz, - const float *alpha, - const float *x_val, - const rocsparse_int *x_ind, - float *y, + const float* alpha, + const float* x_val, + const rocsparse_int* x_ind, + float* y, rocsparse_index_base idx_base) { return rocsparse_saxpyi(handle, nnz, alpha, x_val, x_ind, y, idx_base); @@ -23,10 +23,10 @@ rocsparse_status rocsparse_axpyi(rocsparse_handle handle, template <> rocsparse_status rocsparse_axpyi(rocsparse_handle handle, rocsparse_int nnz, - const double *alpha, - const double *x_val, - const rocsparse_int *x_ind, - double *y, + const double* alpha, + const double* x_val, + const rocsparse_int* x_ind, + double* y, rocsparse_index_base idx_base) { return rocsparse_daxpyi(handle, nnz, alpha, x_val, x_ind, y, idx_base); @@ -34,70 +34,66 @@ rocsparse_status rocsparse_axpyi(rocsparse_handle handle, template <> rocsparse_status rocsparse_csrmv(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, - const float *alpha, + const float* alpha, const rocsparse_mat_descr descr, - const float *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - const float *x, - const float *beta, - float *y) + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const float* x, + const float* beta, + float* y) { - return rocsparse_scsrmv(handle, trans, m, n, nnz, alpha, - descr, csr_val, csr_row_ptr, csr_col_ind, - x, beta, y); + return rocsparse_scsrmv( + handle, trans, m, n, nnz, 
alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } template <> rocsparse_status rocsparse_csrmv(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, - const double *alpha, + const double* alpha, const rocsparse_mat_descr descr, - const double *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - const double *x, - const double *beta, - double *y) + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const double* x, + const double* beta, + double* y) { - return rocsparse_dcsrmv(handle, trans, m, n, nnz, alpha, - descr, csr_val, csr_row_ptr, csr_col_ind, - x, beta, y); + return rocsparse_dcsrmv( + handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } template <> rocsparse_status rocsparse_hybmv(rocsparse_handle handle, - rocsparse_operation trans, - const float *alpha, + rocsparse_operation trans, + const float* alpha, const rocsparse_mat_descr descr, const rocsparse_hyb_mat hyb, - const float *x, - const float *beta, - float *y) + const float* x, + const float* beta, + float* y) { - return rocsparse_shybmv(handle, trans, alpha, descr, - hyb, x, beta, y); + return rocsparse_shybmv(handle, trans, alpha, descr, hyb, x, beta, y); } template <> rocsparse_status rocsparse_hybmv(rocsparse_handle handle, - rocsparse_operation trans, - const double *alpha, + rocsparse_operation trans, + const double* alpha, const rocsparse_mat_descr descr, const rocsparse_hyb_mat hyb, - const double *x, - const double *beta, - double *y) + const double* x, + const double* beta, + double* y) { - return rocsparse_dhybmv(handle, trans, alpha, descr, - hyb, x, beta, y); + return rocsparse_dhybmv(handle, trans, alpha, descr, hyb, x, beta, y); } } // namespace rocsparse diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index 73ceb497..c0a8f11e 100644 --- a/clients/common/unit.cpp 
+++ b/clients/common/unit.cpp @@ -52,8 +52,7 @@ void unit_check_general(rocsparse_int M, rocsparse_int N, double* hCPU, double* } template <> -void unit_check_general( - rocsparse_int M, rocsparse_int N, rocsparse_int* hCPU, rocsparse_int* hGPU) +void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int* hCPU, rocsparse_int* hGPU) { #pragma unroll for(rocsparse_int j = 0; j < N; j++) diff --git a/clients/common/utility.cpp b/clients/common/utility.cpp index 1e49ff43..7307e6df 100644 --- a/clients/common/utility.cpp +++ b/clients/common/utility.cpp @@ -46,12 +46,11 @@ rocsparse_int query_device_property() (int)(props.clockRate / 1000), props.major, props.minor); - printf( - "maxGridDimX %d, sharedMemPerBlock %ldKB, maxThreadsPerBlock %d, warpSize %d\n", - props.maxGridSize[0], - props.sharedMemPerBlock >> 10, - props.maxThreadsPerBlock, - props.warpSize); + printf("maxGridDimX %d, sharedMemPerBlock %ldKB, maxThreadsPerBlock %d, warpSize %d\n", + props.maxGridSize[0], + props.sharedMemPerBlock >> 10, + props.maxThreadsPerBlock, + props.warpSize); printf("-------------------------------------------------------------------------\n"); } diff --git a/clients/include/arg_check.hpp b/clients/include/arg_check.hpp index 3696690c..114d867c 100644 --- a/clients/include/arg_check.hpp +++ b/clients/include/arg_check.hpp @@ -9,15 +9,12 @@ #include -void verify_rocsparse_status_invalid_pointer(rocsparse_status status, - const char* message); +void verify_rocsparse_status_invalid_pointer(rocsparse_status status, const char* message); -void verify_rocsparse_status_invalid_size(rocsparse_status status, - const char* message); +void verify_rocsparse_status_invalid_size(rocsparse_status status, const char* message); void verify_rocsparse_status_invalid_handle(rocsparse_status status); -void verify_rocsparse_status_success(rocsparse_status status, - const char* message); +void verify_rocsparse_status_success(rocsparse_status status, const char* message); #endif // 
ARG_CHECK_HPP diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index 3fde8695..2e59a9d8 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -13,37 +13,36 @@ namespace rocsparse { template rocsparse_status rocsparse_axpyi(rocsparse_handle handle, rocsparse_int nnz, - const T *alpha, - const T *x_val, - const rocsparse_int *x_ind, - T *y, + const T* alpha, + const T* x_val, + const rocsparse_int* x_ind, + T* y, rocsparse_index_base idx_base); template rocsparse_status rocsparse_csrmv(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, - const T *alpha, + const T* alpha, const rocsparse_mat_descr descr, - const T *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - const T *x, - const T *beta, - T *y); + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* x, + const T* beta, + T* y); template rocsparse_status rocsparse_hybmv(rocsparse_handle handle, rocsparse_operation trans, - const T *alpha, + const T* alpha, const rocsparse_mat_descr descr, const rocsparse_hyb_mat hyb, - const T *x, - const T *beta, - T *y); - + const T* x, + const T* beta, + T* y); } #endif // _ROCSPARSE_HPP_ diff --git a/clients/include/rocsparse_test_unique_ptr.hpp b/clients/include/rocsparse_test_unique_ptr.hpp index 9e06cb07..4c0fe481 100644 --- a/clients/include/rocsparse_test_unique_ptr.hpp +++ b/clients/include/rocsparse_test_unique_ptr.hpp @@ -37,9 +37,7 @@ static void* device_malloc(size_t byte_size) } // device_free wraps hipFree and provides same API as free -static void device_free(void* ptr) { - PRINT_IF_HIP_ERROR(hipFree(ptr)); -} +static void device_free(void* ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } struct handle_struct { diff --git a/clients/include/testing_axpyi.hpp b/clients/include/testing_axpyi.hpp index f27346ad..131fe462 100644 --- 
a/clients/include/testing_axpyi.hpp +++ b/clients/include/testing_axpyi.hpp @@ -22,22 +22,21 @@ void testing_axpyi_bad_arg(void) rocsparse_int nnz = 100; rocsparse_int safe_size = 100; T alpha = 0.6; + rocsparse_index_base idx_base = rocsparse_index_base_zero; rocsparse_status status; std::unique_ptr unique_ptr_handle(new handle_struct); rocsparse_handle handle = unique_ptr_handle->handle; - auto dxVal_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T)*safe_size), device_free}; - auto dxInd_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(rocsparse_int)*safe_size), device_free}; - auto dy_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T)*safe_size), device_free}; + auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dxInd_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; - T *dxVal = (T*) dxVal_managed.get(); - rocsparse_int *dxInd = (rocsparse_int*) dxInd_managed.get(); - T *dy = (T*) dy_managed.get(); + T* dxVal = (T*)dxVal_managed.get(); + rocsparse_int* dxInd = (rocsparse_int*)dxInd_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dxInd || !dxVal || !dy) { @@ -45,33 +44,38 @@ void testing_axpyi_bad_arg(void) return; } - // testing for (nullptr == dxInd) + // testing for(nullptr == dxInd) { - rocsparse_int *dxInd_null = nullptr; + rocsparse_int* dxInd_null = nullptr; + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd_null, dy, idx_base); verify_rocsparse_status_invalid_pointer(status, "Error: xInd is nullptr"); } - // testing for (nullptr == dxVal) + // testing for(nullptr == dxVal) { - T *dxVal_null = nullptr; + T* dxVal_null = nullptr; + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal_null, dxInd, dy, idx_base); verify_rocsparse_status_invalid_pointer(status, "Error: xVal is nullptr"); } - // testing for (nullptr == dy) + // testing 
for(nullptr == dy) { - T *dy_null = nullptr; + T* dy_null = nullptr; + status = rocsparse_axpyi(handle, nnz, &alpha, dxVal, dxInd, dy_null, idx_base); verify_rocsparse_status_invalid_pointer(status, "Error: y is nullptr"); } - // testing for (nullptr == d_alpha) + // testing for(nullptr == d_alpha) { - T *d_alpha_null = nullptr; + T* d_alpha_null = nullptr; + status = rocsparse_axpyi(handle, nnz, d_alpha_null, dxVal, dxInd, dy, idx_base); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } - // testing for (nullptr == handle) + // testing for(nullptr == handle) { rocsparse_handle handle_null = nullptr; + status = rocsparse_axpyi(handle_null, nnz, &alpha, dxVal, dxInd, dy, idx_base); verify_rocsparse_status_invalid_handle(status); } @@ -93,29 +97,27 @@ rocsparse_status testing_axpyi(Arguments argus) // Argument sanity check before allocating invalid memory if(nnz <= 0) { - auto dxInd_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; - auto dxVal_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T) * safe_size), device_free}; - auto dy_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T) * safe_size), device_free}; + auto dxInd_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dxVal_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; - rocsparse_int *dxInd = (rocsparse_int*) dxInd_managed.get(); - T *dxVal = (T*) dxVal_managed.get(); - T *dy = (T*) dy_managed.get(); + rocsparse_int* dxInd = (rocsparse_int*)dxInd_managed.get(); + T* dxVal = (T*)dxVal_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dxInd || !dxVal || !dy) { - verify_rocsparse_status_success( - rocsparse_status_memory_error, "!dxInd || !dxVal || !dy"); + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dxInd || !dxVal || 
!dy"); return rocsparse_status_memory_error; } - CHECK_ROCSPARSE_ERROR( - rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); status = rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy, idx_base); - if (nnz < 0) + if(nnz < 0) { verify_rocsparse_status_invalid_size(status, "Error: nnz < 0"); } @@ -145,79 +147,55 @@ rocsparse_status testing_axpyi(Arguments argus) hy_gold = hy_1; // allocate memory on device - auto dxInd_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(rocsparse_int)*nnz), device_free}; - auto dxVal_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T)*nnz), device_free}; - auto dy_1_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T)*N), device_free}; - auto dy_2_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T)*N), device_free}; - auto d_alpha_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T)), device_free}; - - rocsparse_int *dxInd = (rocsparse_int*) dxInd_managed.get(); - T *dxVal = (T*) dxVal_managed.get(); - T *dy_1 = (T*) dy_1_managed.get(); - T *dy_2 = (T*) dy_2_managed.get(); - T *d_alpha = (T*) d_alpha_managed.get(); + auto dxInd_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dxVal_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + + rocsparse_int* dxInd = (rocsparse_int*)dxInd_managed.get(); + T* dxVal = (T*)dxVal_managed.get(); + T* dy_1 = (T*)dy_1_managed.get(); + T* dy_2 = (T*)dy_2_managed.get(); + T* d_alpha = (T*)d_alpha_managed.get(); if(!dxInd || !dxVal || !dy_1 || !dy_2 || !d_alpha) { verify_rocsparse_status_success(rocsparse_status_memory_error, - "!dxInd || 
!dxVal || !dy_1 || !dy_2 || !d_alpha"); + "!dxInd || !dxVal || !dy_1 || !dy_2 || !d_alpha"); return rocsparse_status_memory_error; } // copy data from CPU to device - CHECK_HIP_ERROR(hipMemcpy(dxInd, - hxInd.data(), - sizeof(rocsparse_int)*nnz, - hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dxVal, - hxVal.data(), - sizeof(T)*nnz, - hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy_1, - hy_1.data(), - sizeof(T)*N, - hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dxInd, hxInd.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dxVal, hxVal.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T) * N, hipMemcpyHostToDevice)); if(argus.unit_check) { - CHECK_HIP_ERROR(hipMemcpy(dy_2, - hy_2.data(), - sizeof(T)*N, - hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(d_alpha, - &h_alpha, - sizeof(T), - hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * N, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); // ROCSPARSE pointer mode host - CHECK_ROCSPARSE_ERROR( - rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - CHECK_ROCSPARSE_ERROR( - rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy_1, idx_base)); + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, &h_alpha, dxVal, dxInd, dy_1, idx_base)); // ROCSPARSE pointer mode device - CHECK_ROCSPARSE_ERROR( - rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); - CHECK_ROCSPARSE_ERROR( - rocsparse_axpyi(handle, nnz, d_alpha, dxVal, dxInd, dy_2, idx_base)); + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR(rocsparse_axpyi(handle, nnz, d_alpha, dxVal, dxInd, dy_2, idx_base)); // copy output from device to CPU - CHECK_HIP_ERROR( - 
hipMemcpy(hy_1.data(), dy_1, sizeof(T)*N, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR( - hipMemcpy(hy_2.data(), dy_2, sizeof(T)*N, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * N, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * N, hipMemcpyDeviceToHost)); // CPU double cpu_time_used = get_time_us(); - for (rocsparse_int i=0; ihandle; auto coo_row_ind_managed = - rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), - device_free}; + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto csr_row_ptr_managed = - rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), - device_free}; + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; - rocsparse_int *coo_row_ind = (rocsparse_int*) coo_row_ind_managed.get(); - rocsparse_int *csr_row_ptr = (rocsparse_int*) csr_row_ptr_managed.get(); + rocsparse_int* coo_row_ind = (rocsparse_int*)coo_row_ind_managed.get(); + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); - if (!coo_row_ind || - !csr_row_ptr) + if(!coo_row_ind || !csr_row_ptr) { PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); return; } - // Testing for (coo_row_ind == nullptr) + // Testing for(coo_row_ind == nullptr) { - rocsparse_int *coo_row_ind_null = nullptr; - status = rocsparse_coo2csr(handle, coo_row_ind_null, nnz, m, - csr_row_ptr, rocsparse_index_base_zero); + rocsparse_int* coo_row_ind_null = nullptr; + + status = rocsparse_coo2csr( + handle, coo_row_ind_null, nnz, m, csr_row_ptr, rocsparse_index_base_zero); verify_rocsparse_status_invalid_pointer(status, "Error: coo_row_ind is nullptr"); } - // Testing for (csr_row_ptr == nullptr) + // Testing for(csr_row_ptr == nullptr) { - rocsparse_int *csr_row_ptr_null = nullptr; - status = rocsparse_coo2csr(handle, coo_row_ind, nnz, m, - csr_row_ptr_null, rocsparse_index_base_zero); + rocsparse_int* csr_row_ptr_null = nullptr; + + 
status = rocsparse_coo2csr( + handle, coo_row_ind, nnz, m, csr_row_ptr_null, rocsparse_index_base_zero); verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); } - // Testing for (handle == nullptr) + // Testing for(handle == nullptr) { rocsparse_handle handle_null = nullptr; - status = rocsparse_coo2csr(handle_null, coo_row_ind, nnz, m, - csr_row_ptr, rocsparse_index_base_zero); + + status = rocsparse_coo2csr( + handle_null, coo_row_ind, nnz, m, csr_row_ptr, rocsparse_index_base_zero); verify_rocsparse_status_invalid_handle(status); } } @@ -76,7 +76,7 @@ rocsparse_status testing_coo2csr(Arguments argus) rocsparse_status status; double scale = 0.02; - if (m > 1000 || n > 1000) + if(m > 1000 || n > 1000) { scale = 2.0 / std::max(m, n); } @@ -86,20 +86,17 @@ rocsparse_status testing_coo2csr(Arguments argus) rocsparse_handle handle = unique_ptr_handle->handle; // Argument sanity check before allocating invalid memory - if (m <= 0 || n <= 0 || nnz <= 0) + if(m <= 0 || n <= 0 || nnz <= 0) { auto coo_row_ind_managed = - rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), - device_free}; + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto csr_row_ptr_managed = - rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), - device_free}; + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; - rocsparse_int *coo_row_ind = (rocsparse_int*) coo_row_ind_managed.get(); - rocsparse_int *csr_row_ptr = (rocsparse_int*) csr_row_ptr_managed.get(); + rocsparse_int* coo_row_ind = (rocsparse_int*)coo_row_ind_managed.get(); + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); - if (!coo_row_ind || - !csr_row_ptr) + if(!coo_row_ind || !csr_row_ptr) { verify_rocsparse_status_success(rocsparse_status_memory_error, "!coo_row_ind || !csr_row_ptr"); @@ -108,10 +105,9 @@ rocsparse_status testing_coo2csr(Arguments argus) status = 
rocsparse_coo2csr(handle, coo_row_ind, nnz, m, csr_row_ptr, idx_base); - if (m < 0 || nnz < 0) + if(m < 0 || nnz < 0) { - verify_rocsparse_status_invalid_size(status, "Error: m < 0 || " - "nnz < 0"); + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || nnz < 0"); } else { @@ -125,8 +121,8 @@ rocsparse_status testing_coo2csr(Arguments argus) std::vector hcoo_row_ind(nnz); std::vector hcoo_col_ind(nnz); std::vector hcoo_val(nnz); - std::vector hcsr_row_ptr(m+1); - std::vector hcsr_row_ptr_gold(m+1, 0); + std::vector hcsr_row_ptr(m + 1); + std::vector hcsr_row_ptr_gold(m + 1, 0); // Sample initial COO matrix on CPU srand(12345ULL); @@ -134,69 +130,69 @@ rocsparse_status testing_coo2csr(Arguments argus) // Allocate memory on the device auto dcoo_row_ind_managed = - rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*nnz), - device_free}; + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; auto dcsr_row_ptr_managed = - rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*(m+1)), - device_free}; + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; - rocsparse_int *dcoo_row_ind = (rocsparse_int*) dcoo_row_ind_managed.get(); - rocsparse_int *dcsr_row_ptr = (rocsparse_int*) dcsr_row_ptr_managed.get(); + rocsparse_int* dcoo_row_ind = (rocsparse_int*)dcoo_row_ind_managed.get(); + rocsparse_int* dcsr_row_ptr = (rocsparse_int*)dcsr_row_ptr_managed.get(); - if (!dcoo_row_ind || !dcsr_row_ptr) + if(!dcoo_row_ind || !dcsr_row_ptr) { verify_rocsparse_status_success(rocsparse_status_memory_error, - "!dcoo_row_ind || !dcsr_row_ptr"); + "!dcoo_row_ind || !dcsr_row_ptr"); return rocsparse_status_memory_error; } // Copy data from host to device - CHECK_HIP_ERROR(hipMemcpy(dcoo_row_ind, hcoo_row_ind.data(), - sizeof(rocsparse_int)*nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy( + dcoo_row_ind, hcoo_row_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); - if (argus.unit_check) + 
if(argus.unit_check) { - CHECK_ROCSPARSE_ERROR(rocsparse_coo2csr(handle, dcoo_row_ind, nnz, - m, dcsr_row_ptr, idx_base)); + CHECK_ROCSPARSE_ERROR( + rocsparse_coo2csr(handle, dcoo_row_ind, nnz, m, dcsr_row_ptr, idx_base)); // Copy output from device to host - CHECK_HIP_ERROR(hipMemcpy(hcsr_row_ptr.data(), dcsr_row_ptr, - sizeof(rocsparse_int)*(m+1), hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hcsr_row_ptr.data(), + dcsr_row_ptr, + sizeof(rocsparse_int) * (m + 1), + hipMemcpyDeviceToHost)); // CPU double cpu_time_used = get_time_us(); // coo2csr on host - for (int i=0; ihandle; auto csr_row_ptr_managed = - rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), - device_free}; + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto coo_row_ind_managed = - rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), - device_free}; + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; - rocsparse_int *csr_row_ptr = (rocsparse_int*) csr_row_ptr_managed.get(); - rocsparse_int *coo_row_ind = (rocsparse_int*) coo_row_ind_managed.get(); + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); + rocsparse_int* coo_row_ind = (rocsparse_int*)coo_row_ind_managed.get(); - if (!csr_row_ptr || - !coo_row_ind) + if(!csr_row_ptr || !coo_row_ind) { PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); return; } - // Testing for (csr_row_ptr == nullptr) + // Testing for(csr_row_ptr == nullptr) { - rocsparse_int *csr_row_ptr_null = nullptr; - status = rocsparse_csr2coo(handle, csr_row_ptr_null, nnz, m, - coo_row_ind, rocsparse_index_base_zero); + rocsparse_int* csr_row_ptr_null = nullptr; + + status = rocsparse_csr2coo( + handle, csr_row_ptr_null, nnz, m, coo_row_ind, rocsparse_index_base_zero); verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); } - // Testing for (coo_row_ind == nullptr) + // Testing for(coo_row_ind == nullptr) { - rocsparse_int 
*coo_row_ind_null = nullptr; - status = rocsparse_csr2coo(handle, csr_row_ptr, nnz, m, - coo_row_ind_null, rocsparse_index_base_zero); + rocsparse_int* coo_row_ind_null = nullptr; + + status = rocsparse_csr2coo( + handle, csr_row_ptr, nnz, m, coo_row_ind_null, rocsparse_index_base_zero); verify_rocsparse_status_invalid_pointer(status, "Error: coo_row_ind is nullptr"); } - // Testing for (handle == nullptr) + // Testing for(handle == nullptr) { rocsparse_handle handle_null = nullptr; - status = rocsparse_csr2coo(handle_null, csr_row_ptr, nnz, m, - coo_row_ind, rocsparse_index_base_zero); + + status = rocsparse_csr2coo( + handle_null, csr_row_ptr, nnz, m, coo_row_ind, rocsparse_index_base_zero); verify_rocsparse_status_invalid_handle(status); } } @@ -76,7 +76,7 @@ rocsparse_status testing_csr2coo(Arguments argus) rocsparse_status status; double scale = 0.02; - if (m > 1000 || n > 1000) + if(m > 1000 || n > 1000) { scale = 2.0 / std::max(m, n); } @@ -86,20 +86,17 @@ rocsparse_status testing_csr2coo(Arguments argus) rocsparse_handle handle = unique_ptr_handle->handle; // Argument sanity check before allocating invalid memory - if (m <= 0 || n <= 0 || nnz <= 0) + if(m <= 0 || n <= 0 || nnz <= 0) { auto csr_row_ptr_managed = - rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), - device_free}; + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto coo_row_ind_managed = - rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)*safe_size), - device_free}; + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; - rocsparse_int *csr_row_ptr = (rocsparse_int*) csr_row_ptr_managed.get(); - rocsparse_int *coo_row_ind = (rocsparse_int*) coo_row_ind_managed.get(); + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); + rocsparse_int* coo_row_ind = (rocsparse_int*)coo_row_ind_managed.get(); - if (!csr_row_ptr || - !coo_row_ind) + if(!csr_row_ptr || !coo_row_ind) { 
verify_rocsparse_status_success(rocsparse_status_memory_error, "!csr_row_ptr || !coo_row_ind"); @@ -108,10 +105,9 @@ rocsparse_status testing_csr2coo(Arguments argus) status = rocsparse_csr2coo(handle, csr_row_ptr, nnz, m, coo_row_ind, idx_base); - if (m < 0 || nnz < 0) + if(m < 0 || nnz < 0) { - verify_rocsparse_status_invalid_size(status, "Error: m < 0 || " - "nnz < 0"); + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || nnz < 0"); } else { @@ -134,68 +130,66 @@ rocsparse_status testing_csr2coo(Arguments argus) gen_matrix_coo(m, n, nnz, hcoo_row_ind_gold, hcoo_col_ind, hcoo_val, idx_base); // Convert COO to CSR - std::vector hcsr_row_ptr(m+1); + std::vector hcsr_row_ptr(m + 1); // csr2coo on host - for (int i=0; i unique_ptr_descr(new descr_struct); rocsparse_mat_descr descr = unique_ptr_descr->descr; - auto dptr_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(rocsparse_int)*safe_size), - device_free}; - auto dcol_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(rocsparse_int)*safe_size), - device_free}; - auto dval_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T)*safe_size), - device_free}; - auto dx_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T)*safe_size), - device_free}; - auto dy_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T)*safe_size), - device_free}; - - rocsparse_int *dptr = (rocsparse_int*) dptr_managed.get(); - rocsparse_int *dcol = (rocsparse_int*) dcol_managed.get(); - T *dval = (T*) dval_managed.get(); - T *dx = (T*) dx_managed.get(); - T *dy = (T*) dy_managed.get(); + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = 
rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dval || !dptr || !dcol || !dx || !dy) { @@ -63,67 +55,76 @@ void testing_csrmv_bad_arg(void) return; } - // testing for (nullptr == dptr) + // testing for(nullptr == dptr) { - rocsparse_int *dptr_null = nullptr; - status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, - dval, dptr_null, dcol, dx, &beta, dy); + rocsparse_int* dptr_null = nullptr; + + status = rocsparse_csrmv( + handle, trans, m, n, nnz, &alpha, descr, dval, dptr_null, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); } - // testing for (nullptr == dcol) + // testing for(nullptr == dcol) { - rocsparse_int *dcol_null = nullptr; - status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, - dval, dptr, dcol_null, dx, &beta, dy); + rocsparse_int* dcol_null = nullptr; + + status = rocsparse_csrmv( + handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol_null, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); } - // testing for (nullptr == dval) + // testing for(nullptr == dval) { - T *dval_null = nullptr; - status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, - dval_null, dptr, dcol, dx, &beta, dy); + T* dval_null = nullptr; + + status = rocsparse_csrmv( + handle, trans, m, n, nnz, &alpha, descr, dval_null, dptr, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); } - // testing for (nullptr == dx) + // testing for(nullptr == dx) { - T *dx_null = nullptr; - status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, - dval, dptr, dcol, dx_null, &beta, dy); + T* dx_null = nullptr; + + status = rocsparse_csrmv( + handle, trans, m, n, nnz, 
&alpha, descr, dval, dptr, dcol, dx_null, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); } - // testing for (nullptr == dy) + // testing for(nullptr == dy) { - T *dy_null = nullptr; - status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, - dval, dptr, dcol, dx, &beta, dy_null); + T* dy_null = nullptr; + + status = rocsparse_csrmv( + handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy_null); verify_rocsparse_status_invalid_pointer(status, "Error: dy is nullptr"); } - // testing for (nullptr == d_alpha) + // testing for(nullptr == d_alpha) { - T *d_alpha_null = nullptr; - status = rocsparse_csrmv(handle, trans, m, n, nnz, d_alpha_null, descr, - dval, dptr, dcol, dx, &beta, dy); + T* d_alpha_null = nullptr; + + status = rocsparse_csrmv( + handle, trans, m, n, nnz, d_alpha_null, descr, dval, dptr, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } - // testing for (nullptr == d_beta) + // testing for(nullptr == d_beta) { - T *d_beta_null = nullptr; - status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr, - dval, dptr, dcol, dx, d_beta_null, dy); + T* d_beta_null = nullptr; + + status = rocsparse_csrmv( + handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, d_beta_null, dy); verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); } - // testing for (nullptr == descr) + // testing for(nullptr == descr) { rocsparse_mat_descr descr_null = nullptr; - status = rocsparse_csrmv(handle, trans, m, n, nnz, &alpha, descr_null, - dval, dptr, dcol, dx, &beta, dy); + + status = rocsparse_csrmv( + handle, trans, m, n, nnz, &alpha, descr_null, dval, dptr, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } - // testing for (nullptr == handle) + // testing for(nullptr == handle) { rocsparse_handle handle_null = nullptr; - status = rocsparse_csrmv(handle_null, trans, m, n, nnz, 
&alpha, descr, - dval, dptr, dcol, dx, &beta, dy); + + status = rocsparse_csrmv( + handle_null, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_handle(status); } } @@ -148,7 +149,7 @@ rocsparse_status testing_csrmv(Arguments argus) // Determine number of non-zero elements double scale = 0.02; - if (m > 1000 || n > 1000) + if(m > 1000 || n > 1000) { scale = 2.0 / std::max(m, n); } @@ -157,44 +158,34 @@ rocsparse_status testing_csrmv(Arguments argus) // Argument sanity check before allocating invalid memory if(m <= 0 || n <= 0 || nnz <= 0) { - auto dptr_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(rocsparse_int)*safe_size), - device_free}; - auto dcol_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(rocsparse_int)*safe_size), - device_free}; - auto dval_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T)*safe_size), - device_free}; - auto dx_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T)*safe_size), - device_free}; - auto dy_managed = rocsparse_unique_ptr{ - device_malloc(sizeof(T)*safe_size), - device_free}; - - rocsparse_int *dptr = (rocsparse_int*) dptr_managed.get(); - rocsparse_int *dcol = (rocsparse_int*) dcol_managed.get(); - T *dval = (T*) dval_managed.get(); - T *dx = (T*) dx_managed.get(); - T *dy = (T*) dy_managed.get(); - - if (!dval || !dptr || !dcol || !dx || !dy) + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = 
(rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dval || !dptr || !dcol || !dx || !dy) { verify_rocsparse_status_success(rocsparse_status_memory_error, "!dptr || !dcol || !dval || !dx || !dy"); return rocsparse_status_memory_error; } - CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, - rocsparse_pointer_mode_host)); - status = rocsparse_csrmv(handle, trans, m, n, nnz, &h_alpha, - descr, dval, dptr, dcol, dx, &h_beta, dy); + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + status = rocsparse_csrmv( + handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy); - if (m < 0 || n < 0 || nnz < 0) + if(m < 0 || n < 0 || nnz < 0) { - verify_rocsparse_status_invalid_size(status, "Error: m < 0 || " - "n < 0 || nnz < 0"); + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); } else { @@ -212,19 +203,17 @@ rocsparse_status testing_csrmv(Arguments argus) // Initial Data on CPU srand(12345ULL); - if (argus.laplacian) + if(argus.laplacian) { - m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, - hcol_ind, hval, idx_base); - nnz = hcsr_row_ptr[m]; + m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcol_ind, hval, idx_base); + nnz = hcsr_row_ptr[m]; } else { - if (argus.filename != "") + if(argus.filename != "") { - if (read_mtx_matrix(argus.filename.c_str(), - m, n, nnz, - hcoo_row_ind, hcol_ind, hval) != 0) + if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval) != + 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; @@ -236,18 +225,18 @@ rocsparse_status testing_csrmv(Arguments argus) } // Convert COO to CSR - if (!argus.laplacian) + if(!argus.laplacian) { - hcsr_row_ptr.resize(m+1, 0); - for (int i=0; i& A, rocsparse_int M, rocsparse_int N) /*! 
\brief vector initialization: */ // initialize sparse index vector with nnz entries ranging from start to end template -void rocsparse_init_index(I *x, rocsparse_int nnz, - rocsparse_int start, rocsparse_int end) +void rocsparse_init_index(I* x, rocsparse_int nnz, rocsparse_int start, rocsparse_int end) { - std::vector check(end-start, false); + std::vector check(end - start, false); int num = 0; - while (num < nnz) + while(num < nnz) { - rocsparse_int val = start + rand() % (end-start); - if (!check[val-start]) + rocsparse_int val = start + rand() % (end - start); + if(!check[val - start]) { - x[num] = val; - check[val-start] = true; + x[num] = val; + check[val - start] = true; ++num; } } - std::sort(x, x+nnz); + std::sort(x, x + nnz); }; /* ============================================================================================ */ /*! \brief csr matrix initialization */ template -void rocsparse_init_csr(std::vector &ptr, std::vector &col, - std::vector &val, - rocsparse_int nrow, rocsparse_int ncol, rocsparse_int nnz) +void rocsparse_init_csr(std::vector& ptr, + std::vector& col, + std::vector& val, + rocsparse_int nrow, + rocsparse_int ncol, + rocsparse_int nnz) { // Row offsets - ptr[0] = 0; + ptr[0] = 0; ptr[nrow] = nnz; - for (rocsparse_int i=1; i(); } @@ -150,40 +152,41 @@ void rocsparse_init_csr(std::vector &ptr, std::vector rocsparse_int gen_2d_laplacian(rocsparse_int ndim, - std::vector &rowptr, - std::vector &col, - std::vector &val, + std::vector& rowptr, + std::vector& col, + std::vector& val, rocsparse_index_base idx_base) { - if (ndim == 0) { + if(ndim == 0) + { return 0; } - rocsparse_int n = ndim * ndim; + rocsparse_int n = ndim * ndim; rocsparse_int nnz_mat = n * 5 - ndim * 4; - rowptr.resize(n+1); + rowptr.resize(n + 1); col.resize(nnz_mat); val.resize(nnz_mat); rocsparse_int nnz = 0; // Fill local arrays - for (rocsparse_int i=0; i(-1); ++nnz; } // if no left boundary element, connect with left neighbor - if (j != 0) + if(j != 0) { col[nnz] 
= idx - 1 + idx_base; val[nnz] = static_cast(-1); @@ -194,14 +197,14 @@ rocsparse_int gen_2d_laplacian(rocsparse_int ndim, val[nnz] = static_cast(4); ++nnz; // if no right boundary element, connect with right neighbor - if (j != ndim - 1) + if(j != ndim - 1) { col[nnz] = idx + 1 + idx_base; val[nnz] = static_cast(-1); ++nnz; } // if no lower boundary element, connect with lower neighbor - if (i != ndim - 1) + if(i != ndim - 1) { col[nnz] = idx + ndim + idx_base; val[nnz] = static_cast(-1); @@ -220,26 +223,26 @@ template void gen_matrix_coo(rocsparse_int m, rocsparse_int n, rocsparse_int nnz, - std::vector &row_ind, - std::vector &col_ind, - std::vector &val, + std::vector& row_ind, + std::vector& col_ind, + std::vector& val, rocsparse_index_base idx_base) { - if (row_ind.size() != nnz) + if(row_ind.size() != nnz) { row_ind.resize(nnz); } - if (col_ind.size() != nnz) + if(col_ind.size() != nnz) { col_ind.resize(nnz); } - if (val.size() != nnz) + if(val.size() != nnz) { val.resize(nnz); } // Uniform distributed row indices - for (rocsparse_int i=0; i check(nnz, false); - rocsparse_int i=0; - while (i < nnz) + rocsparse_int i = 0; + while(i < nnz) { rocsparse_int begin = i; - while (row_ind[i] == row_ind[begin]) + while(row_ind[i] == row_ind[begin]) + { ++i; + } // Sample i disjunct column indices rocsparse_int idx = begin; - while (idx < i) + while(idx < i) { // Normal distribution around the diagonal - rocsparse_int rng = (i - begin) - * sqrt(-2.0 * log((double) rand() / RAND_MAX)) - * cos(2.0 * M_PI * (double) rand() / RAND_MAX); + rocsparse_int rng = (i - begin) * sqrt(-2.0 * log((double)rand() / RAND_MAX)) * + cos(2.0 * M_PI * (double)rand() / RAND_MAX); - if (m <= n) + if(m <= n) { rng += row_ind[begin]; } // Repeat if running out of bounds - if (rng < 0 || rng > n-1) + if(rng < 0 || rng > n - 1) + { continue; + } // Check for disjunct column index in current row - if (!check[rng]) + if(!check[rng]) { - check[rng] = true; + check[rng] = true; col_ind[idx] = rng; 
++idx; } } // Reset disjunct check array - for (rocsparse_int j=begin; j();//(double) rand() / RAND_MAX; + val[i] = random_generator(); //(double) rand() / RAND_MAX; } - } /* ============================================================================================ */ /*! \brief Read matrix from mtx file in COO format */ template -rocsparse_int read_mtx_matrix(const char *filename, - rocsparse_int &nrow, - rocsparse_int &ncol, - rocsparse_int &nnz, - std::vector &row, - std::vector &col, - std::vector &val) +rocsparse_int read_mtx_matrix(const char* filename, + rocsparse_int& nrow, + rocsparse_int& ncol, + rocsparse_int& nnz, + std::vector& row, + std::vector& col, + std::vector& val) { - FILE *f = fopen(filename, "r"); - if (!f) + FILE* f = fopen(filename, "r"); + if(!f) { return -1; } @@ -330,7 +337,7 @@ rocsparse_int read_mtx_matrix(const char *filename, char line[1024]; // Check for banner - if (!fgets(line, 1024, f)) + if(!fgets(line, 1024, f)) { return -1; } @@ -342,44 +349,47 @@ rocsparse_int read_mtx_matrix(const char *filename, char type[16]; // Extract banner - if (sscanf(line, "%s %s %s %s %s", banner, array, coord, data, type) != 5) + if(sscanf(line, "%s %s %s %s %s", banner, array, coord, data, type) != 5) { return -1; } // Convert to lower case - for (char *p=array; *p!='\0'; *p=tolower(*p), p++); - for (char *p=coord; *p!='\0'; *p=tolower(*p), p++); - for (char *p=data; *p!='\0'; *p=tolower(*p), p++); - for (char *p=type; *p!='\0'; *p=tolower(*p), p++); + for(char *p = array; *p != '\0'; *p = tolower(*p), p++) + ; + for(char *p = coord; *p != '\0'; *p = tolower(*p), p++) + ; + for(char *p = data; *p != '\0'; *p = tolower(*p), p++) + ; + for(char *p = type; *p != '\0'; *p = tolower(*p), p++) + ; // Check banner - if (strncmp(line, "%%MatrixMarket", 14) != 0) + if(strncmp(line, "%%MatrixMarket", 14) != 0) { return -1; } // Check array type - if (strcmp(array, "matrix") != 0) + if(strcmp(array, "matrix") != 0) { return -1; } // Check coord - if 
(strcmp(coord, "coordinate") != 0) + if(strcmp(coord, "coordinate") != 0) { return -1; } // Check data - if (strcmp(data, "real") != 0) + if(strcmp(data, "real") != 0) { return -1; } // Check type - if (strcmp(type, "general") != 0 && - strcmp(type, "symmetric") != 0) + if(strcmp(type, "general") != 0 && strcmp(type, "symmetric") != 0) { return -1; } @@ -390,7 +400,7 @@ rocsparse_int read_mtx_matrix(const char *filename, // Skip comments while(fgets(line, 1024, f)) { - if (line[0] != '%') + if(line[0] != '%') { break; } @@ -421,20 +431,17 @@ rocsparse_int read_mtx_matrix(const char *filename, row[idx] = irow; col[idx] = icol; - val[idx] = (T) dval; + val[idx] = (T)dval; ++idx; - if (symm && irow != icol) { - + if(symm && irow != icol) + { row[idx] = icol; col[idx] = irow; - val[idx] = (T) dval; - + val[idx] = (T)dval; ++idx; - } - } fclose(f); @@ -444,35 +451,37 @@ rocsparse_int read_mtx_matrix(const char *filename, /* ============================================================================================ */ /*! 
\brief Convert matrix from COO to CSR format */ template -void coo_to_csr(rocsparse_int nrow, rocsparse_int ncol, rocsparse_int nnz, - const std::vector &src_row, - const std::vector &src_col, - const std::vector &src_val, - std::vector &dst_ptr, - std::vector &dst_col, - std::vector &dst_val) +void coo_to_csr(rocsparse_int nrow, + rocsparse_int ncol, + rocsparse_int nnz, + const std::vector& src_row, + const std::vector& src_col, + const std::vector& src_val, + std::vector& dst_ptr, + std::vector& dst_col, + std::vector& dst_val) { - dst_ptr.resize(nrow+1, 0); + dst_ptr.resize(nrow + 1, 0); dst_col.resize(nnz); dst_val.resize(nnz); // Compute nnz entries per row - for (rocsparse_int i=0; i #include -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { // Parse command line - if (argc < 2) + if(argc < 2) { fprintf(stderr, "%s [ ]\n", argv[0]); return -1; } - int ndim = atoi(argv[1]); - int trials = 200; + int ndim = atoi(argv[1]); + int trials = 200; int batch_size = 1; - if (argc > 2) + if(argc > 2) { trials = atoi(argv[2]); } - if (argc > 3) + if(argc > 3) { batch_size = atoi(argv[3]); } @@ -46,14 +46,14 @@ int main(int argc, char *argv[]) std::vector hAptr; std::vector hAcol; std::vector hAval; - int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); - int n = m; + int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); + int n = m; int nnz = hAptr[m]; // Sample some random data srand(12345ULL); - double halpha = (double) rand() / RAND_MAX; + double halpha = static_cast(rand()) / RAND_MAX; double hbeta = 0.0; std::vector hx(m); @@ -64,32 +64,40 @@ int main(int argc, char *argv[]) rocsparse_create_mat_descr(&descrA); // Offload data to device - int *dAptr = NULL; - int *dAcol = NULL; - double *dAval = NULL; - double *dx = NULL; - double *dy = NULL; - - hipMalloc((void**) &dAptr, sizeof(int)*(m+1)); - hipMalloc((void**) &dAcol, sizeof(int)*nnz); - hipMalloc((void**) &dAval, sizeof(double)*nnz); - 
hipMalloc((void**) &dx, sizeof(double)*m); - hipMalloc((void**) &dy, sizeof(double)*m); - - hipMemcpy(dAptr, hAptr.data(), sizeof(int)*(m+1), hipMemcpyHostToDevice); - hipMemcpy(dAcol, hAcol.data(), sizeof(int)*nnz, hipMemcpyHostToDevice); - hipMemcpy(dAval, hAval.data(), sizeof(double)*nnz, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(double)*m, hipMemcpyHostToDevice); + int* dAptr = NULL; + int* dAcol = NULL; + double* dAval = NULL; + double* dx = NULL; + double* dy = NULL; + + hipMalloc((void**)&dAptr, sizeof(int) * (m + 1)); + hipMalloc((void**)&dAcol, sizeof(int) * nnz); + hipMalloc((void**)&dAval, sizeof(double) * nnz); + hipMalloc((void**)&dx, sizeof(double) * m); + hipMalloc((void**)&dy, sizeof(double) * m); + + hipMemcpy(dAptr, hAptr.data(), sizeof(int) * (m + 1), hipMemcpyHostToDevice); + hipMemcpy(dAcol, hAcol.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dAval, hAval.data(), sizeof(double) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dx, hx.data(), sizeof(double) * m, hipMemcpyHostToDevice); // Warm up - for (int i=0; i<10; ++i) + for(int i = 0; i < 10; ++i) { // Call rocsparse csrmv - rocsparse_dcsrmv(handle, rocsparse_operation_none, - m, n, nnz, - &halpha, descrA, - dAval, dAptr, dAcol, - dx, &hbeta, dy); + rocsparse_dcsrmv(handle, + rocsparse_operation_none, + m, + n, + nnz, + &halpha, + descrA, + dAval, + dAptr, + dAcol, + dx, + &hbeta, + dy); } // Device synchronization @@ -99,32 +107,45 @@ int main(int argc, char *argv[]) double time = get_time_us(); // CSR matrix vector multiplication - for (int i=0; i(sizeof(double)*(2*m+nnz) - +sizeof(rocsparse_int)*(m+1+nnz))/time/1e6; - double gflops = static_cast(2*nnz)/time/1e6; + time = (get_time_us() - time) / (trials * batch_size * 1e3); + double bandwidth = static_cast(sizeof(double) * (2 * m + nnz) + + sizeof(rocsparse_int) * (m + 1 + nnz)) / + time / 1e6; + double gflops = static_cast(2 * nnz) / time / 1e6; printf("m\t\tn\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tusec\n"); 
printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", - m, n, nnz, halpha, hbeta, gflops, bandwidth, time); - - - + m, + n, + nnz, + halpha, + hbeta, + gflops, + bandwidth, + time); // Clear up on device hipFree(dAptr); diff --git a/clients/samples/example_ellmv.cpp b/clients/samples/example_ellmv.cpp index 5d01c84d..c0445c2f 100644 --- a/clients/samples/example_ellmv.cpp +++ b/clients/samples/example_ellmv.cpp @@ -9,24 +9,24 @@ #include #include -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { // Parse command line - if (argc < 2) + if(argc < 2) { fprintf(stderr, "%s [ ]\n", argv[0]); return -1; } - int ndim = atoi(argv[1]); - int trials = 200; + int ndim = atoi(argv[1]); + int trials = 200; int batch_size = 1; - if (argc > 2) + if(argc > 2) { trials = atoi(argv[2]); } - if (argc > 3) + if(argc > 3) { batch_size = atoi(argv[3]); } @@ -46,14 +46,14 @@ int main(int argc, char *argv[]) std::vector hAptr; std::vector hAcol; std::vector hAval; - int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); - int n = m; + int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); + int n = m; int nnz = hAptr[m]; // Sample some random data srand(12345ULL); - double halpha = (double) rand() / RAND_MAX; + double halpha = static_cast(rand()) / RAND_MAX; double hbeta = 0.0; std::vector hx(m); @@ -64,22 +64,22 @@ int main(int argc, char *argv[]) rocsparse_create_mat_descr(&descrA); // Offload data to device - int *dAptr = NULL; - int *dAcol = NULL; - double *dAval = NULL; - double *dx = NULL; - double *dy = NULL; - - hipMalloc((void**) &dAptr, sizeof(int)*(m+1)); - hipMalloc((void**) &dAcol, sizeof(int)*nnz); - hipMalloc((void**) &dAval, sizeof(double)*nnz); - hipMalloc((void**) &dx, sizeof(double)*m); - hipMalloc((void**) &dy, sizeof(double)*m); - - hipMemcpy(dAptr, hAptr.data(), sizeof(int)*(m+1), hipMemcpyHostToDevice); - hipMemcpy(dAcol, hAcol.data(), sizeof(int)*nnz, hipMemcpyHostToDevice); - 
hipMemcpy(dAval, hAval.data(), sizeof(double)*nnz, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(double)*m, hipMemcpyHostToDevice); + int* dAptr = NULL; + int* dAcol = NULL; + double* dAval = NULL; + double* dx = NULL; + double* dy = NULL; + + hipMalloc((void**)&dAptr, sizeof(int) * (m + 1)); + hipMalloc((void**)&dAcol, sizeof(int) * nnz); + hipMalloc((void**)&dAval, sizeof(double) * nnz); + hipMalloc((void**)&dx, sizeof(double) * m); + hipMalloc((void**)&dy, sizeof(double) * m); + + hipMemcpy(dAptr, hAptr.data(), sizeof(int) * (m + 1), hipMemcpyHostToDevice); + hipMemcpy(dAcol, hAcol.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dAval, hAval.data(), sizeof(double) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dx, hx.data(), sizeof(double) * m, hipMemcpyHostToDevice); // Convert CSR matrix to HYB format, using partition type to be // rocsparse_hyb_partition_max. This will result in ELL matrix format, @@ -87,9 +87,8 @@ int main(int argc, char *argv[]) rocsparse_hyb_mat hybA; rocsparse_create_hyb_mat(&hybA); - rocsparse_dcsr2hyb(handle, m, n, - descrA, dAval, dAptr, dAcol, - hybA, 0, rocsparse_hyb_partition_max); + rocsparse_dcsr2hyb( + handle, m, n, descrA, dAval, dAptr, dAcol, hybA, 0, rocsparse_hyb_partition_max); // Clean up CSR structures hipFree(dAptr); @@ -97,12 +96,10 @@ int main(int argc, char *argv[]) hipFree(dAval); // Warm up - for (int i=0; i<10; ++i) + for(int i = 0; i < 10; ++i) { // Call rocsparse hybmv - rocsparse_dhybmv(handle, rocsparse_operation_none, - &halpha, descrA, hybA, - dx, &hbeta, dy); + rocsparse_dhybmv(handle, rocsparse_operation_none, &halpha, descrA, hybA, dx, &hbeta, dy); } // Device synchronization @@ -112,14 +109,13 @@ int main(int argc, char *argv[]) double time = get_time_us(); // HYB matrix vector multiplication - for (int i=0; i(sizeof(double)*(2*m+nnz) - +sizeof(rocsparse_int)*(nnz))/time/1e6; - double gflops = static_cast(2*nnz)/time/1e6; + double bandwidth = + static_cast(sizeof(double) * (2 * m + 
nnz) + sizeof(rocsparse_int) * (nnz)) / time / + 1e6; + double gflops = static_cast(2 * nnz) / time / 1e6; printf("m\t\tn\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tusec\n"); printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", - m, n, nnz, halpha, hbeta, gflops, bandwidth, time); - - // Clean up + m, + n, + nnz, + halpha, + hbeta, + gflops, + bandwidth, + time); + + // Clear up on device rocsparse_destroy_hyb_mat(hybA); rocsparse_destroy_mat_descr(descrA); rocsparse_destroy_handle(handle); diff --git a/clients/samples/example_handle.cpp b/clients/samples/example_handle.cpp index 08c1160a..4147d8d8 100644 --- a/clients/samples/example_handle.cpp +++ b/clients/samples/example_handle.cpp @@ -5,7 +5,7 @@ #include #include -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { rocsparse_handle handle; rocsparse_create_handle(&handle); @@ -13,10 +13,7 @@ int main(int argc, char *argv[]) int version; rocsparse_get_version(handle, &version); - printf("rocSPARSE version %d.%d.%d\n", - version / 100000, - version / 100 % 1000, - version % 100); + printf("rocSPARSE version %d.%d.%d\n", version / 100000, version / 100 % 1000, version % 100); rocsparse_destroy_handle(handle); diff --git a/clients/tests/rocsparse_gtest_main.cpp b/clients/tests/rocsparse_gtest_main.cpp index 91583232..06581252 100644 --- a/clients/tests/rocsparse_gtest_main.cpp +++ b/clients/tests/rocsparse_gtest_main.cpp @@ -14,7 +14,7 @@ int main(int argc, char** argv) { // Device Query - int device_id = 0; + int device_id = 0; int device_count = query_device_property(); if(device_count <= device_id) diff --git a/clients/tests/test_axpyi.cpp b/clients/tests/test_axpyi.cpp index 7d694753..00118a1a 100644 --- a/clients/tests/test_axpyi.cpp +++ b/clients/tests/test_axpyi.cpp @@ -12,11 +12,12 @@ typedef rocsparse_index_base base; typedef std::tuple axpyi_tuple; -int axpyi_N_range[] = {12000, 15332, 22031}; +int axpyi_N_range[] = {12000, 15332, 22031}; int axpyi_nnz_range[] = {-1, 0, 5, 10, 
500, 1000, 7111, 10000}; + std::vector axpyi_alpha_range = {1.0, 0.0}; -base axpyi_idx_base_range[] = {rocsparse_index_base_zero, - rocsparse_index_base_one}; + +base axpyi_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; class parameterized_axpyi : public testing::TestWithParam { @@ -38,14 +39,12 @@ Arguments setup_axpyi_arguments(axpyi_tuple tup) return arg; } -TEST(axpyi_bad_arg, axpyi_float) -{ - testing_axpyi_bad_arg(); -} +TEST(axpyi_bad_arg, axpyi_float) { testing_axpyi_bad_arg(); } TEST_P(parameterized_axpyi, axpyi_float) { Arguments arg = setup_axpyi_arguments(GetParam()); + rocsparse_status status = testing_axpyi(arg); EXPECT_EQ(status, rocsparse_status_success); } @@ -53,11 +52,13 @@ TEST_P(parameterized_axpyi, axpyi_float) TEST_P(parameterized_axpyi, axpyi_double) { Arguments arg = setup_axpyi_arguments(GetParam()); + rocsparse_status status = testing_axpyi(arg); EXPECT_EQ(status, rocsparse_status_success); } -INSTANTIATE_TEST_CASE_P(axpyi, parameterized_axpyi, +INSTANTIATE_TEST_CASE_P(axpyi, + parameterized_axpyi, testing::Combine(testing::ValuesIn(axpyi_N_range), testing::ValuesIn(axpyi_nnz_range), testing::ValuesIn(axpyi_alpha_range), diff --git a/clients/tests/test_coo2csr.cpp b/clients/tests/test_coo2csr.cpp index d6c6bc27..1ba84553 100644 --- a/clients/tests/test_coo2csr.cpp +++ b/clients/tests/test_coo2csr.cpp @@ -13,8 +13,8 @@ typedef std::tuple coo2csr_tuple; int coo_M_range[] = {-1, 0, 10, 500, 872, 1000}; int coo_N_range[] = {-3, 0, 33, 242, 623, 1000}; -rocsparse_index_base coo_idx_base_range[] = {rocsparse_index_base_zero, - rocsparse_index_base_one}; + +rocsparse_index_base coo_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; class parameterized_coo2csr : public testing::TestWithParam { @@ -35,19 +35,18 @@ Arguments setup_coo2csr_arguments(coo2csr_tuple tup) return arg; } -TEST(coo2csr_bad_arg, coo2csr) -{ - testing_coo2csr_bad_arg(); -} +TEST(coo2csr_bad_arg, coo2csr) { 
testing_coo2csr_bad_arg(); } TEST_P(parameterized_coo2csr, coo2csr) { Arguments arg = setup_coo2csr_arguments(GetParam()); + rocsparse_status status = testing_coo2csr(arg); EXPECT_EQ(status, rocsparse_status_success); } -INSTANTIATE_TEST_CASE_P(coo2csr, parameterized_coo2csr, +INSTANTIATE_TEST_CASE_P(coo2csr, + parameterized_coo2csr, testing::Combine(testing::ValuesIn(coo_M_range), testing::ValuesIn(coo_N_range), testing::ValuesIn(coo_idx_base_range))); diff --git a/clients/tests/test_csr2coo.cpp b/clients/tests/test_csr2coo.cpp index 101b440c..bc68cc3c 100644 --- a/clients/tests/test_csr2coo.cpp +++ b/clients/tests/test_csr2coo.cpp @@ -13,8 +13,8 @@ typedef std::tuple csr2coo_tuple; int coo_M_range[] = {-1, 0, 10, 500, 872, 1000}; int coo_N_range[] = {-3, 0, 33, 242, 623, 1000}; -rocsparse_index_base coo_idx_base_range[] = {rocsparse_index_base_zero, - rocsparse_index_base_one}; + +rocsparse_index_base coo_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; class parameterized_csr2coo : public testing::TestWithParam { @@ -35,19 +35,18 @@ Arguments setup_csr2coo_arguments(csr2coo_tuple tup) return arg; } -TEST(csr2coo_bad_arg, csr2coo) -{ - testing_csr2coo_bad_arg(); -} +TEST(csr2coo_bad_arg, csr2coo) { testing_csr2coo_bad_arg(); } TEST_P(parameterized_csr2coo, csr2coo) { Arguments arg = setup_csr2coo_arguments(GetParam()); + rocsparse_status status = testing_csr2coo(arg); EXPECT_EQ(status, rocsparse_status_success); } -INSTANTIATE_TEST_CASE_P(csr2coo, parameterized_csr2coo, +INSTANTIATE_TEST_CASE_P(csr2coo, + parameterized_csr2coo, testing::Combine(testing::ValuesIn(coo_M_range), testing::ValuesIn(coo_N_range), testing::ValuesIn(coo_idx_base_range))); diff --git a/clients/tests/test_csrmv.cpp b/clients/tests/test_csrmv.cpp index 72c972c6..bf4042ac 100644 --- a/clients/tests/test_csrmv.cpp +++ b/clients/tests/test_csrmv.cpp @@ -14,8 +14,10 @@ typedef std::tuple csrmv_tuple; int csr_M_range[] = {-1, 0, 10, 500, 7111, 10000}; int csr_N_range[] = 
{-3, 0, 33, 842, 4441, 10000}; + std::vector csr_alpha_range = {2.0, 3.0}; -std::vector csr_beta_range = {0.0, 1.0}; +std::vector csr_beta_range = {0.0, 1.0}; + base csr_idxbase_range[] = {rocsparse_index_base_zero}; class parameterized_csrmv : public testing::TestWithParam @@ -39,14 +41,12 @@ Arguments setup_csrmv_arguments(csrmv_tuple tup) return arg; } -TEST(csrmv_bad_arg, csrmv_float) -{ - testing_csrmv_bad_arg(); -} +TEST(csrmv_bad_arg, csrmv_float) { testing_csrmv_bad_arg(); } TEST_P(parameterized_csrmv, csrmv_float) { Arguments arg = setup_csrmv_arguments(GetParam()); + rocsparse_status status = testing_csrmv(arg); EXPECT_EQ(status, rocsparse_status_success); } @@ -54,11 +54,13 @@ TEST_P(parameterized_csrmv, csrmv_float) TEST_P(parameterized_csrmv, csrmv_double) { Arguments arg = setup_csrmv_arguments(GetParam()); + rocsparse_status status = testing_csrmv(arg); EXPECT_EQ(status, rocsparse_status_success); } -INSTANTIATE_TEST_CASE_P(csrmv, parameterized_csrmv, +INSTANTIATE_TEST_CASE_P(csrmv, + parameterized_csrmv, testing::Combine(testing::ValuesIn(csr_M_range), testing::ValuesIn(csr_N_range), testing::ValuesIn(csr_alpha_range), diff --git a/library/include/rocsparse-auxiliary.h b/library/include/rocsparse-auxiliary.h index d21710a3..31f2f879 100644 --- a/library/include/rocsparse-auxiliary.h +++ b/library/include/rocsparse-auxiliary.h @@ -28,7 +28,7 @@ extern "C" { * It should be destroyed at the end using rocsparse_destroy_handle(). *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_create_handle(rocsparse_handle *handle); +rocsparse_status rocsparse_create_handle(rocsparse_handle* handle); /******************************************************************************** * \brief Destroy handle. @@ -46,7 +46,7 @@ rocsparse_status rocsparse_set_stream(rocsparse_handle handle, hipStream_t strea * \brief Get stream [0] from handle. 
*******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_get_stream(rocsparse_handle handle, hipStream_t *stream); +rocsparse_status rocsparse_get_stream(rocsparse_handle handle, hipStream_t* stream); /******************************************************************************** * \brief Set rocsparse_pointer_mode. @@ -60,7 +60,7 @@ rocsparse_status rocsparse_set_pointer_mode(rocsparse_handle handle, *******************************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, - rocsparse_pointer_mode *pointer_mode); + rocsparse_pointer_mode* pointer_mode); /******************************************************************************** * \brief Get rocSPARSE version @@ -69,7 +69,7 @@ rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, * version / 100000 = major version *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_get_version(rocsparse_handle handle, int *version); +rocsparse_status rocsparse_get_version(rocsparse_handle handle, rocsparse_int* version); /******************************************************************************** * \brief rocsparse_create_mat_descr_t is a structure holding the rocsparse matrix @@ -79,7 +79,7 @@ rocsparse_status rocsparse_get_version(rocsparse_handle handle, int *version); * It should be destroyed at the end using rocsparse_destroy_mat_descr(). *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descr); +rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr* descr); /******************************************************************************** * \brief Destroy the matrix descriptor. 
@@ -91,8 +91,7 @@ rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descr); * \brief Set the index base of the matrix descriptor. *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descr, - rocsparse_index_base base); +rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descr, rocsparse_index_base base); /******************************************************************************** * \brief Returns the index base of the matrix descriptor. @@ -104,8 +103,7 @@ rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr desc * \brief Set the matrix type of the matrix descriptor. *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descr, - rocsparse_matrix_type type); +rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descr, rocsparse_matrix_type type); /******************************************************************************** * \brief Returns the matrix type of the matrix descriptor. @@ -121,7 +119,7 @@ rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr); * It should be destroyed at the end using rocsparse_destroy_hyb_mat(). *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat *hyb); +rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat* hyb); /******************************************************************************** * \brief Destroy HYB matrix. 
diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 5018915b..f142afe0 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -55,37 +55,37 @@ extern "C" { ROCSPARSE_EXPORT rocsparse_status rocsparse_saxpyi(rocsparse_handle handle, rocsparse_int nnz, - const float *alpha, - const float *x_val, - const rocsparse_int *x_ind, - float *y, + const float* alpha, + const float* x_val, + const rocsparse_int* x_ind, + float* y, rocsparse_index_base idx_base); ROCSPARSE_EXPORT rocsparse_status rocsparse_daxpyi(rocsparse_handle handle, rocsparse_int nnz, - const double *alpha, - const double *x_val, - const rocsparse_int *x_ind, - double *y, + const double* alpha, + const double* x_val, + const rocsparse_int* x_ind, + double* y, rocsparse_index_base idx_base); /* ROCSPARSE_EXPORT rocsparse_status rocsparse_caxpyi(rocsparse_handle handle, rocsparse_int nnz, - const rocsparse_float_complex *alpha, - const rocsparse_float_complex *x_val, - const rocsparse_int *x_ind, - rocsparse_float_complex *y, + const rocsparse_float_complex* alpha, + const rocsparse_float_complex* x_val, + const rocsparse_int* x_ind, + rocsparse_float_complex* y, rocsparse_index_base idx_base); ROCSPARSE_EXPORT rocsparse_status rocsparse_zaxpyi(rocsparse_handle handle, rocsparse_int nnz, - const rocsparse_double_complex *alpha, - const rocsparse_double_complex *x_val, - const rocsparse_int *x_ind, - rocsparse_double_complex *y, + const rocsparse_double_complex* alpha, + const rocsparse_double_complex* x_val, + const rocsparse_int* x_ind, + rocsparse_double_complex* y, rocsparse_index_base idx_base); */ @@ -142,14 +142,14 @@ rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, - const float *alpha, + const float* alpha, const rocsparse_mat_descr descr, - const float *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - const float *x, 
- const float *beta, - float *y); + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const float* x, + const float* beta, + float* y); ROCSPARSE_EXPORT rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, @@ -157,14 +157,14 @@ rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, - const double *alpha, + const double* alpha, const rocsparse_mat_descr descr, - const double *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - const double *x, - const double *beta, - double *y); + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const double* x, + const double* beta, + double* y); /* ROCSPARSE_EXPORT rocsparse_status rocsparse_ccsrmv(rocsparse_handle handle, @@ -172,14 +172,14 @@ rocsparse_status rocsparse_ccsrmv(rocsparse_handle handle, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, - const rocsparse_float_complex *alpha, + const rocsparse_float_complex* alpha, const rocsparse_mat_descr descr, - const rocsparse_float_complex *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - const rocsparse_float_complex *x, - const rocsparse_float_complex *beta, - rocsparse_float_complex *y); + const rocsparse_float_complex* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_float_complex* x, + const rocsparse_float_complex* beta, + rocsparse_float_complex* y); ROCSPARSE_EXPORT rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, @@ -187,14 +187,14 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, rocsparse_int m, rocsparse_int n, rocsparse_int nnz, - const rocsparse_double_complex *alpha, + const rocsparse_double_complex* alpha, const rocsparse_mat_descr descr, - const rocsparse_double_complex *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - const 
rocsparse_double_complex *x, - const rocsparse_double_complex *beta, - rocsparse_double_complex *y); + const rocsparse_double_complex* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_double_complex* x, + const rocsparse_double_complex* beta, + rocsparse_double_complex* y); */ /*! \brief SPARSE Level 2 API @@ -230,42 +230,42 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, ROCSPARSE_EXPORT rocsparse_status rocsparse_shybmv(rocsparse_handle handle, rocsparse_operation trans, - const float *alpha, + const float* alpha, const rocsparse_mat_descr descr, const rocsparse_hyb_mat hyb, - const float *x, - const float *beta, - float *y); + const float* x, + const float* beta, + float* y); ROCSPARSE_EXPORT rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, rocsparse_operation trans, - const double *alpha, + const double* alpha, const rocsparse_mat_descr descr, const rocsparse_hyb_mat hyb, - const double *x, - const double *beta, - double *y); + const double* x, + const double* beta, + double* y); /* ROCSPARSE_EXPORT rocsparse_status rocsparse_shybmv(rocsparse_handle handle, rocsparse_operation trans, - const rocsparse_float_complex *alpha, + const rocsparse_float_complex* alpha, const rocsparse_mat_descr descr, const rocsparse_hyb_mat hyb, - const rocsparse_float_complex *x, - const rocsparse_float_complex *beta, - rocsparse_float_complex *y); + const rocsparse_float_complex* x, + const rocsparse_float_complex* beta, + rocsparse_float_complex* y); ROCSPARSE_EXPORT rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, rocsparse_operation trans, - const rocsparse_double_complex *alpha, + const rocsparse_double_complex* alpha, const rocsparse_mat_descr descr, const rocsparse_hyb_mat hyb, - const rocsparse_double_complex *x, - const rocsparse_double_complex *beta, - rocsparse_double_complex *y); + const rocsparse_double_complex* x, + const rocsparse_double_complex* beta, + rocsparse_double_complex* y); 
*/ /* * =========================================================================== @@ -303,10 +303,10 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, ********************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, - const rocsparse_int *csr_row_ptr, + const rocsparse_int* csr_row_ptr, rocsparse_int nnz, rocsparse_int m, - rocsparse_int *coo_row_ind, + rocsparse_int* coo_row_ind, rocsparse_index_base idx_base); /*! \brief SPARSE Format Conversions API @@ -333,10 +333,10 @@ rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, ********************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, - const rocsparse_int *coo_row_ind, + const rocsparse_int* coo_row_ind, rocsparse_int nnz, rocsparse_int m, - rocsparse_int *csr_row_ptr, + rocsparse_int* csr_row_ptr, rocsparse_index_base idx_base); /*! \brief SPARSE Format Conversions API @@ -379,9 +379,9 @@ rocsparse_status rocsparse_scsr2hyb(rocsparse_handle handle, rocsparse_int m, rocsparse_int n, const rocsparse_mat_descr descr, - const float *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, rocsparse_hyb_mat hyb, rocsparse_int user_ell_width, rocsparse_hyb_partition partition_type); @@ -391,9 +391,9 @@ rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, rocsparse_int m, rocsparse_int n, const rocsparse_mat_descr descr, - const double *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, rocsparse_hyb_mat hyb, rocsparse_int user_ell_width, rocsparse_hyb_partition partition_type); @@ -403,9 +403,9 @@ rocsparse_status rocsparse_scsr2hyb(rocsparse_handle handle, rocsparse_int m, 
rocsparse_int n, const rocsparse_mat_descr descr, - const rocsparse_float_complex *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, + const rocsparse_float_complex* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, rocsparse_hyb_mat hyb, rocsparse_int user_ell_width, rocsparse_hyb_partition partition_type); @@ -415,9 +415,9 @@ rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, rocsparse_int m, rocsparse_int n, const rocsparse_mat_descr descr, - const rocsparse_double_complex *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, + const rocsparse_double_complex* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, rocsparse_hyb_mat hyb, rocsparse_int user_ell_width, rocsparse_hyb_partition partition_type); diff --git a/library/include/rocsparse-types.h b/library/include/rocsparse-types.h index cc300752..8f696068 100644 --- a/library/include/rocsparse-types.h +++ b/library/include/rocsparse-types.h @@ -20,9 +20,9 @@ typedef int64_t rocsparse_int; typedef int32_t rocsparse_int; #endif -typedef struct _rocsparse_handle *rocsparse_handle; -typedef struct _rocsparse_mat_descr *rocsparse_mat_descr; -typedef struct _rocsparse_hyb_mat *rocsparse_hyb_mat; +typedef struct _rocsparse_handle* rocsparse_handle; +typedef struct _rocsparse_mat_descr* rocsparse_mat_descr; +typedef struct _rocsparse_hyb_mat* rocsparse_hyb_mat; #ifdef __cplusplus extern "C" { diff --git a/library/src/conversion/coo2csr_device.h b/library/src/conversion/coo2csr_device.h index 761c0873..c63e1912 100644 --- a/library/src/conversion/coo2csr_device.h +++ b/library/src/conversion/coo2csr_device.h @@ -8,20 +8,19 @@ #include -__device__ -rocsparse_int lower_bound(const rocsparse_int *arr, - rocsparse_int key, - rocsparse_int low, - rocsparse_int high) +__device__ rocsparse_int lower_bound(const rocsparse_int* arr, + rocsparse_int key, + rocsparse_int low, + rocsparse_int high) { - 
if (low > high) + if(low > high) { return low; } rocsparse_int mid = low + ((high - low) >> 1); - if (arr[mid] >= key) + if(arr[mid] >= key) { high = mid - 1; } @@ -32,21 +31,20 @@ rocsparse_int lower_bound(const rocsparse_int *arr, return lower_bound(arr, key, low, high); } -__global__ -void coo2csr_kernel(rocsparse_int m, - rocsparse_int nnz, - const rocsparse_int *coo_row_ind, - rocsparse_int *csr_row_ptr, - rocsparse_index_base idx_base) +__global__ void coo2csr_kernel(rocsparse_int m, + rocsparse_int nnz, + const rocsparse_int* coo_row_ind, + rocsparse_int* csr_row_ptr, + rocsparse_index_base idx_base) { rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - if (gid >= m) + if(gid >= m) { return; } - if (gid == 0) + if(gid == 0) { csr_row_ptr[0] = idx_base; csr_row_ptr[m] = nnz + idx_base; @@ -54,7 +52,7 @@ void coo2csr_kernel(rocsparse_int m, } // Binary search - csr_row_ptr[gid] = lower_bound(coo_row_ind, gid+idx_base, 0, nnz-1) + idx_base; + csr_row_ptr[gid] = lower_bound(coo_row_ind, gid + idx_base, 0, nnz - 1) + idx_base; } #endif // COO2CSR_DEVICE_H diff --git a/library/src/conversion/csr2coo_device.h b/library/src/conversion/csr2coo_device.h index 7752061a..4cd6d2cd 100644 --- a/library/src/conversion/csr2coo_device.h +++ b/library/src/conversion/csr2coo_device.h @@ -9,22 +9,21 @@ #include template -__global__ -void csr2coo_kernel(rocsparse_int m, - const rocsparse_int *csr_row_ptr, - rocsparse_int *coo_row_ind, - rocsparse_index_base idx_base) +__global__ void csr2coo_kernel(rocsparse_int m, + const rocsparse_int* csr_row_ptr, + rocsparse_int* coo_row_ind, + rocsparse_index_base idx_base) { rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; rocsparse_int lid = hipThreadIdx_x % THREADS; rocsparse_int vid = gid / THREADS; rocsparse_int nvec = hipGridDim_x * hipBlockDim_x / THREADS; - for (rocsparse_int ai=vid; ai template -__device__ -void ell_width_reduce(rocsparse_int tid, rocsparse_int *data) +__device__ void 
ell_width_reduce(rocsparse_int tid, rocsparse_int* data) { __syncthreads(); - for (int i=NB>>1; i>0; i>>=1) + for(int i = NB >> 1; i > 0; i >>= 1) { - if (tid < i) + if(tid < i) { - data[tid] = max(data[tid], data[tid+i]); + data[tid] = max(data[tid], data[tid + i]); } __syncthreads(); @@ -28,19 +27,17 @@ void ell_width_reduce(rocsparse_int tid, rocsparse_int *data) } template -__global__ -void ell_width_kernel_part1(rocsparse_int m, - const rocsparse_int *csr_row_ptr, - rocsparse_int *workspace) +__global__ void +ell_width_kernel_part1(rocsparse_int m, const rocsparse_int* csr_row_ptr, rocsparse_int* workspace) { rocsparse_int tid = hipThreadIdx_x; rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; __shared__ rocsparse_int sdata[NB]; - if (gid < m) + if(gid < m) { - sdata[tid] = csr_row_ptr[gid+1] - csr_row_ptr[gid]; + sdata[tid] = csr_row_ptr[gid + 1] - csr_row_ptr[gid]; } else { @@ -49,33 +46,32 @@ void ell_width_kernel_part1(rocsparse_int m, ell_width_reduce(tid, sdata); - if (tid == 0) + if(tid == 0) { workspace[hipBlockIdx_x] = sdata[0]; } } template -__global__ -void ell_width_kernel_part2(rocsparse_int m, rocsparse_int *workspace) +__global__ void ell_width_kernel_part2(rocsparse_int m, rocsparse_int* workspace) { rocsparse_int tid = hipThreadIdx_x; __shared__ rocsparse_int sdata[NB]; sdata[tid] = 0; - for (rocsparse_int i=tid; i sdata[tid]) ? workspace[i] : sdata[tid]; } __syncthreads(); - if (m < 32) + if(m < 32) { - if (tid == 0) + if(tid == 0) { - for (rocsparse_int i=1; i sdata[0]) ? 
sdata[i] : sdata[0]; } @@ -86,25 +82,24 @@ void ell_width_kernel_part2(rocsparse_int m, rocsparse_int *workspace) ell_width_reduce(tid, sdata); } - if (tid == 0) + if(tid == 0) { workspace[0] = sdata[0]; } } template -__global__ -void csr2ell_kernel(rocsparse_int m, - const T *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - rocsparse_int ell_width, - rocsparse_int *ell_col_ind, - T *ell_val) +__global__ void csr2ell_kernel(rocsparse_int m, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_int ell_width, + rocsparse_int* ell_col_ind, + T* ell_val) { rocsparse_int ai = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - if (ai >= m) + if(ai >= m) { return; } @@ -113,9 +108,9 @@ void csr2ell_kernel(rocsparse_int m, rocsparse_int aj = csr_row_ptr[ai]; // Fill ELL matrix - for (; aj= ell_width) + if(p >= ell_width) { break; } @@ -128,7 +123,7 @@ void csr2ell_kernel(rocsparse_int m, // TODO store rownnz // Pad remaining ELL structure - for (; aj -extern "C" -rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, - const rocsparse_int *coo_row_ind, - rocsparse_int nnz, - rocsparse_int m, - rocsparse_int *csr_row_ptr, - rocsparse_index_base idx_base) +extern "C" rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, + const rocsparse_int* coo_row_ind, + rocsparse_int nnz, + rocsparse_int m, + rocsparse_int* csr_row_ptr, + rocsparse_index_base idx_base) { // Check for valid handle - if (handle == nullptr) + if(handle == nullptr) { return rocsparse_status_invalid_handle; } @@ -26,34 +25,34 @@ rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, // Logging TODO bench logging log_trace(handle, "rocsparse_coo2csr", - (const void*&) coo_row_ind, + (const void*&)coo_row_ind, nnz, m, - (const void*&) csr_row_ptr, + (const void*&)csr_row_ptr, idx_base); // Check sizes - if (nnz < 0) + if(nnz < 0) { return rocsparse_status_invalid_size; } - else if (m < 0) + else if(m < 0) { 
return rocsparse_status_invalid_size; } // Check pointer arguments - if (coo_row_ind == nullptr) + if(coo_row_ind == nullptr) { return rocsparse_status_invalid_pointer; } - else if (csr_row_ptr == nullptr) + else if(csr_row_ptr == nullptr) { return rocsparse_status_invalid_pointer; } // Quick return if possible - if (nnz == 0 || m == 0) + if(nnz == 0 || m == 0) { return rocsparse_status_success; } @@ -62,12 +61,19 @@ rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, hipStream_t stream = handle->stream; #define COO2CSR_DIM 512 - dim3 coo2csr_blocks((m-1)/COO2CSR_DIM+1); + dim3 coo2csr_blocks((m - 1) / COO2CSR_DIM + 1); dim3 coo2csr_threads(COO2CSR_DIM); hipLaunchKernelGGL((coo2csr_kernel), - coo2csr_blocks, coo2csr_threads, 0, stream, - m, nnz, coo_row_ind, csr_row_ptr, idx_base); + coo2csr_blocks, + coo2csr_threads, + 0, + stream, + m, + nnz, + coo_row_ind, + csr_row_ptr, + idx_base); #undef COO2CSR_DIM return rocsparse_status_success; } diff --git a/library/src/conversion/rocsparse_csr2coo.cpp b/library/src/conversion/rocsparse_csr2coo.cpp index b65ed80e..3cb5a901 100644 --- a/library/src/conversion/rocsparse_csr2coo.cpp +++ b/library/src/conversion/rocsparse_csr2coo.cpp @@ -9,16 +9,15 @@ #include -extern "C" -rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, - const rocsparse_int *csr_row_ptr, - rocsparse_int nnz, - rocsparse_int m, - rocsparse_int *coo_row_ind, - rocsparse_index_base idx_base) +extern "C" rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, + const rocsparse_int* csr_row_ptr, + rocsparse_int nnz, + rocsparse_int m, + rocsparse_int* coo_row_ind, + rocsparse_index_base idx_base) { // Check for valid handle - if (handle == nullptr) + if(handle == nullptr) { return rocsparse_status_invalid_handle; } @@ -26,34 +25,34 @@ rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, // Logging TODO bench logging log_trace(handle, "rocsparse_csr2coo", - (const void*&) csr_row_ptr, + (const void*&)csr_row_ptr, nnz, m, - (const 
void*&) coo_row_ind, + (const void*&)coo_row_ind, idx_base); // Check sizes - if (nnz < 0) + if(nnz < 0) { return rocsparse_status_invalid_size; } - else if (m < 0) + else if(m < 0) { return rocsparse_status_invalid_size; } // Check pointer arguments - if (csr_row_ptr == nullptr) + if(csr_row_ptr == nullptr) { return rocsparse_status_invalid_pointer; } - else if (coo_row_ind == nullptr) + else if(coo_row_ind == nullptr) { return rocsparse_status_invalid_pointer; } // Quick return if possible - if (nnz == 0 || m == 0) + if(nnz == 0 || m == 0) { return rocsparse_status_success; } @@ -64,79 +63,145 @@ rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, #define CSR2COO_DIM 512 rocsparse_int nnz_per_row = nnz / m; - dim3 csr2coo_blocks((m-1)/CSR2COO_DIM+1); + dim3 csr2coo_blocks((m - 1) / CSR2COO_DIM + 1); dim3 csr2coo_threads(CSR2COO_DIM); - if (handle->warp_size == 32) + if(handle->warp_size == 32) { - if (nnz_per_row < 4) + if(nnz_per_row < 4) { hipLaunchKernelGGL((csr2coo_kernel<2>), - csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind, idx_base); + csr2coo_blocks, + csr2coo_threads, + 0, + stream, + m, + csr_row_ptr, + coo_row_ind, + idx_base); } - else if (nnz_per_row < 8) + else if(nnz_per_row < 8) { hipLaunchKernelGGL((csr2coo_kernel<4>), - csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind, idx_base); + csr2coo_blocks, + csr2coo_threads, + 0, + stream, + m, + csr_row_ptr, + coo_row_ind, + idx_base); } - else if (nnz_per_row < 16) + else if(nnz_per_row < 16) { hipLaunchKernelGGL((csr2coo_kernel<8>), - csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind, idx_base); + csr2coo_blocks, + csr2coo_threads, + 0, + stream, + m, + csr_row_ptr, + coo_row_ind, + idx_base); } - else if (nnz_per_row < 32) + else if(nnz_per_row < 32) { hipLaunchKernelGGL((csr2coo_kernel<16>), - csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind, idx_base); + csr2coo_blocks, + csr2coo_threads, + 
0, + stream, + m, + csr_row_ptr, + coo_row_ind, + idx_base); } else { hipLaunchKernelGGL((csr2coo_kernel<32>), - csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind, idx_base); + csr2coo_blocks, + csr2coo_threads, + 0, + stream, + m, + csr_row_ptr, + coo_row_ind, + idx_base); } } - else if (handle->warp_size == 64) + else if(handle->warp_size == 64) { - if (nnz_per_row < 4) + if(nnz_per_row < 4) { hipLaunchKernelGGL((csr2coo_kernel<2>), - csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind, idx_base); + csr2coo_blocks, + csr2coo_threads, + 0, + stream, + m, + csr_row_ptr, + coo_row_ind, + idx_base); } - else if (nnz_per_row < 8) + else if(nnz_per_row < 8) { hipLaunchKernelGGL((csr2coo_kernel<4>), - csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind, idx_base); + csr2coo_blocks, + csr2coo_threads, + 0, + stream, + m, + csr_row_ptr, + coo_row_ind, + idx_base); } - else if (nnz_per_row < 16) + else if(nnz_per_row < 16) { hipLaunchKernelGGL((csr2coo_kernel<8>), - csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind, idx_base); + csr2coo_blocks, + csr2coo_threads, + 0, + stream, + m, + csr_row_ptr, + coo_row_ind, + idx_base); } - else if (nnz_per_row < 32) + else if(nnz_per_row < 32) { hipLaunchKernelGGL((csr2coo_kernel<16>), - csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind, idx_base); + csr2coo_blocks, + csr2coo_threads, + 0, + stream, + m, + csr_row_ptr, + coo_row_ind, + idx_base); } - else if (nnz_per_row < 64) + else if(nnz_per_row < 64) { hipLaunchKernelGGL((csr2coo_kernel<32>), - csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind, idx_base); + csr2coo_blocks, + csr2coo_threads, + 0, + stream, + m, + csr_row_ptr, + coo_row_ind, + idx_base); } else { hipLaunchKernelGGL((csr2coo_kernel<64>), - csr2coo_blocks, csr2coo_threads, 0, stream, - m, csr_row_ptr, coo_row_ind, idx_base); + csr2coo_blocks, + csr2coo_threads, + 0, + stream, + 
m, + csr_row_ptr, + coo_row_ind, + idx_base); } } else diff --git a/library/src/conversion/rocsparse_csr2hyb.cpp b/library/src/conversion/rocsparse_csr2hyb.cpp index 710590fc..04516e81 100644 --- a/library/src/conversion/rocsparse_csr2hyb.cpp +++ b/library/src/conversion/rocsparse_csr2hyb.cpp @@ -15,23 +15,23 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, rocsparse_int m, rocsparse_int n, const rocsparse_mat_descr descr, - const T *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, rocsparse_hyb_mat hyb, rocsparse_int user_ell_width, rocsparse_hyb_partition partition_type) { // Check for valid handle and matrix descriptor - if (handle == nullptr) + if(handle == nullptr) { return rocsparse_status_invalid_handle; } - else if (descr == nullptr) + else if(descr == nullptr) { return rocsparse_status_invalid_pointer; } - else if (hyb == nullptr) + else if(hyb == nullptr) { return rocsparse_status_invalid_pointer; } @@ -41,56 +41,56 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, replaceX("rocsparse_Xcsr2hyb"), m, n, - (const void*&) descr, - (const void*&) csr_val, - (const void*&) csr_row_ptr, - (const void*&) csr_col_ind, - (const void*&) hyb, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)hyb, user_ell_width, partition_type); // Check matrix type - if (descr->base != rocsparse_index_base_zero) + if(descr->base != rocsparse_index_base_zero) { // TODO return rocsparse_status_not_implemented; } - if (descr->type != rocsparse_matrix_type_general) + if(descr->type != rocsparse_matrix_type_general) { // TODO return rocsparse_status_not_implemented; } - if (partition_type != rocsparse_hyb_partition_max) + if(partition_type != rocsparse_hyb_partition_max) { return rocsparse_status_not_implemented; } // Check sizes - if (m < 0) + if(m 
< 0) { return rocsparse_status_invalid_size; } - else if (n < 0) + else if(n < 0) { return rocsparse_status_invalid_size; } // Check pointer arguments - if (csr_val == nullptr) + if(csr_val == nullptr) { return rocsparse_status_invalid_pointer; } - else if (csr_row_ptr == nullptr) + else if(csr_row_ptr == nullptr) { return rocsparse_status_invalid_pointer; } - else if (csr_col_ind == nullptr) + else if(csr_col_ind == nullptr) { return rocsparse_status_invalid_pointer; } // Quick return if possible - if (m == 0 || n == 0) + if(m == 0 || n == 0) { return rocsparse_status_success; } @@ -106,52 +106,58 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, hyb->ell_width = 0; hyb->coo_nnz = 0; - if (hyb->ell_col_ind) + if(hyb->ell_col_ind) { RETURN_IF_HIP_ERROR(hipFree(hyb->ell_col_ind)); } - if (hyb->ell_val) + if(hyb->ell_val) { RETURN_IF_HIP_ERROR(hipFree(hyb->ell_val)); } - if (hyb->coo_row_ind) + if(hyb->coo_row_ind) { RETURN_IF_HIP_ERROR(hipFree(hyb->coo_row_ind)); } - if (hyb->coo_col_ind) + if(hyb->coo_col_ind) { RETURN_IF_HIP_ERROR(hipFree(hyb->coo_col_ind)); } - if (hyb->coo_val) + if(hyb->coo_val) { RETURN_IF_HIP_ERROR(hipFree(hyb->coo_val)); } #define CSR2ELL_DIM 512 // TODO we take max partition - if (partition_type == rocsparse_hyb_partition_max) + if(partition_type == rocsparse_hyb_partition_max) { // ELL part only, compute maximum non-zeros per row rocsparse_int blocks = handle->warp_size; // Allocate workspace - rocsparse_int *workspace = NULL; - RETURN_IF_HIP_ERROR( - hipMalloc((void**) &workspace, sizeof(rocsparse_int)*blocks)); + rocsparse_int* workspace = NULL; + RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace, sizeof(rocsparse_int) * blocks)); hipLaunchKernelGGL((ell_width_kernel_part1), - dim3(blocks), dim3(CSR2ELL_DIM), 0, stream, - m, csr_row_ptr, workspace); + dim3(blocks), + dim3(CSR2ELL_DIM), + 0, + stream, + m, + csr_row_ptr, + workspace); hipLaunchKernelGGL((ell_width_kernel_part2), - dim3(1), dim3(CSR2ELL_DIM), 0, 
stream, - blocks, workspace); + dim3(1), + dim3(CSR2ELL_DIM), + 0, + stream, + blocks, + workspace); // Copy ell width back to host - RETURN_IF_HIP_ERROR(hipMemcpy(&hyb->ell_width, - workspace, - sizeof(rocsparse_int), - hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR( + hipMemcpy(&hyb->ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); RETURN_IF_HIP_ERROR(hipFree(workspace)); } else @@ -164,18 +170,24 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, hyb->ell_nnz = hyb->ell_width * m; // Allocate ELL part - RETURN_IF_HIP_ERROR( - hipMalloc((void**) &hyb->ell_col_ind, sizeof(rocsparse_int)*hyb->ell_nnz)); - RETURN_IF_HIP_ERROR(hipMalloc(&hyb->ell_val, sizeof(T)*hyb->ell_nnz)); + RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->ell_col_ind, sizeof(rocsparse_int) * hyb->ell_nnz)); + RETURN_IF_HIP_ERROR(hipMalloc(&hyb->ell_val, sizeof(T) * hyb->ell_nnz)); - dim3 csr2ell_blocks((m-1)/CSR2ELL_DIM+1); + dim3 csr2ell_blocks((m - 1) / CSR2ELL_DIM + 1); dim3 csr2ell_threads(CSR2ELL_DIM); - hipLaunchKernelGGL((csr2ell_kernel), - csr2ell_blocks, csr2ell_threads, 0, stream, - m, csr_val, csr_row_ptr, csr_col_ind, - hyb->ell_width, hyb->ell_col_ind, (T*) hyb->ell_val); + csr2ell_blocks, + csr2ell_threads, + 0, + stream, + m, + csr_val, + csr_row_ptr, + csr_col_ind, + hyb->ell_width, + hyb->ell_col_ind, + (T*)hyb->ell_val); #undef CSR2ELL_DIM return rocsparse_status_success; } @@ -186,36 +198,48 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, * =========================================================================== */ -extern "C" -rocsparse_status rocsparse_scsr2hyb(rocsparse_handle handle, - rocsparse_int m, - rocsparse_int n, - const rocsparse_mat_descr descr, - const float *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - rocsparse_hyb_mat hyb, - rocsparse_int user_ell_width, - rocsparse_hyb_partition partition_type) +extern "C" rocsparse_status rocsparse_scsr2hyb(rocsparse_handle 
handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type) { - return rocsparse_csr2hyb_template(handle, m, n, - descr, csr_val, csr_row_ptr, csr_col_ind, - hyb, user_ell_width, partition_type); + return rocsparse_csr2hyb_template(handle, + m, + n, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + hyb, + user_ell_width, + partition_type); } -extern "C" -rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, - rocsparse_int m, - rocsparse_int n, - const rocsparse_mat_descr descr, - const double *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - rocsparse_hyb_mat hyb, - rocsparse_int user_ell_width, - rocsparse_hyb_partition partition_type) +extern "C" rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type) { - return rocsparse_csr2hyb_template(handle, m, n, - descr, csr_val, csr_row_ptr, csr_col_ind, - hyb, user_ell_width, partition_type); + return rocsparse_csr2hyb_template(handle, + m, + n, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + hyb, + user_ell_width, + partition_type); } diff --git a/library/src/handle.cpp b/library/src/handle.cpp index db571067..d873570d 100644 --- a/library/src/handle.cpp +++ b/library/src/handle.cpp @@ -21,8 +21,8 @@ _rocsparse_handle::_rocsparse_handle() warp_size = properties.warpSize; // Layer mode - char *str_layer_mode; - if ((str_layer_mode = getenv("ROCSPARSE_LAYER")) == NULL) + char* str_layer_mode; + if((str_layer_mode = getenv("ROCSPARSE_LAYER")) == NULL) { layer_mode = rocsparse_layer_mode_none; } 
@@ -32,13 +32,13 @@ _rocsparse_handle::_rocsparse_handle() } // Open log file - if (layer_mode & rocsparse_layer_mode_log_trace) + if(layer_mode & rocsparse_layer_mode_log_trace) { open_log_stream(&log_trace_os, &log_trace_ofs, "ROCSPARSE_LOG_TRACE_PATH"); } // Open log_bench file - if (layer_mode & rocsparse_layer_mode_log_bench) + if(layer_mode & rocsparse_layer_mode_log_bench) { open_log_stream(&log_bench_os, &log_bench_ofs, "ROCSPARSE_LOG_BENCH_PATH"); } @@ -50,11 +50,11 @@ _rocsparse_handle::_rocsparse_handle() _rocsparse_handle::~_rocsparse_handle() { // Close log files - if (log_trace_ofs.is_open()) + if(log_trace_ofs.is_open()) { log_trace_ofs.close(); } - if (log_bench_ofs.is_open()) + if(log_bench_ofs.is_open()) { log_bench_ofs.close(); } @@ -79,7 +79,7 @@ rocsparse_status _rocsparse_handle::set_stream(hipStream_t user_stream) /******************************************************************************* * get stream ******************************************************************************/ -rocsparse_status _rocsparse_handle::get_stream(hipStream_t *user_stream) const +rocsparse_status _rocsparse_handle::get_stream(hipStream_t* user_stream) const { *user_stream = stream; return rocsparse_status_success; diff --git a/library/src/include/definitions.h b/library/src/include/definitions.h index 51c3824f..ea888ffa 100644 --- a/library/src/include/definitions.h +++ b/library/src/include/definitions.h @@ -17,7 +17,7 @@ #define RETURN_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ { \ hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if (TMP_STATUS_FOR_CHECK != hipSuccess) \ + if(TMP_STATUS_FOR_CHECK != hipSuccess) \ { \ return get_rocsparse_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ } \ @@ -26,7 +26,7 @@ #define THROW_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ { \ hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if (TMP_STATUS_FOR_CHECK != hipSuccess) \ + if(TMP_STATUS_FOR_CHECK != hipSuccess) \ { \ throw 
get_rocsparse_status_for_hip_status(TMP_STATUS_FOR_CHECK); \ } \ @@ -35,7 +35,7 @@ #define PRINT_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ { \ hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ - if (TMP_STATUS_FOR_CHECK != hipSuccess) \ + if(TMP_STATUS_FOR_CHECK != hipSuccess) \ { \ fprintf(stderr, \ "hip error code: %d at %s:%d\n", \ diff --git a/library/src/include/handle.h b/library/src/include/handle.h index d3cda017..bba65ce7 100644 --- a/library/src/include/handle.h +++ b/library/src/include/handle.h @@ -29,7 +29,7 @@ struct _rocsparse_handle // Set stream rocsparse_status set_stream(hipStream_t user_stream); // Get stream - rocsparse_status get_stream(hipStream_t *user_stream) const; + rocsparse_status get_stream(hipStream_t* user_stream) const; // device id rocsparse_int device; @@ -47,8 +47,8 @@ struct _rocsparse_handle // logging streams std::ofstream log_trace_ofs; std::ofstream log_bench_ofs; - std::ostream *log_trace_os; - std::ostream *log_bench_os; + std::ostream* log_trace_os; + std::ostream* log_bench_os; }; /******************************************************************************** @@ -63,9 +63,9 @@ struct _rocsparse_mat_descr // Matrix type rocsparse_matrix_type type = rocsparse_matrix_type_general; // Fill mode TODO -// rocsparse_fill_mode fill; + // rocsparse_fill_mode fill; // Diagonal type -// rocsparse_diag_type diag; + // rocsparse_diag_type diag; // Index base rocsparse_index_base base = rocsparse_index_base_zero; }; @@ -88,23 +88,23 @@ struct _rocsparse_hyb_mat rocsparse_hyb_partition partition = rocsparse_hyb_partition_auto; // ELL matrix part - rocsparse_int ell_nnz = 0; - rocsparse_int ell_width = 0; - rocsparse_int *ell_col_ind = nullptr; - void *ell_val = nullptr; + rocsparse_int ell_nnz = 0; + rocsparse_int ell_width = 0; + rocsparse_int* ell_col_ind = nullptr; + void* ell_val = nullptr; // COO matrix part - rocsparse_int coo_nnz = 0; - rocsparse_int *coo_row_ind = nullptr; - rocsparse_int *coo_col_ind = nullptr; - void 
*coo_val = nullptr; + rocsparse_int coo_nnz = 0; + rocsparse_int* coo_row_ind = nullptr; + rocsparse_int* coo_col_ind = nullptr; + void* coo_val = nullptr; }; /******************************************************************************** * \brief ELL format indexing *******************************************************************************/ #define ELL_IND_ROW(i, el, m, width) (el) * (m) + (i) -#define ELL_IND_EL (i, el, m, width) (el) + (width) * (i) +#define ELL_IND_EL(i, el, m, width) (el) + (width) * (i) #define ELL_IND(i, el, m, width) ELL_IND_ROW(i, el, m, width) #endif // HANDLE_H diff --git a/library/src/include/logging.h b/library/src/include/logging.h index d6c3a917..543cd308 100644 --- a/library/src/include/logging.h +++ b/library/src/include/logging.h @@ -112,7 +112,7 @@ struct log_arg { os_ << separator_ << x; } -/* + /* /// Overload () operator for rocsparse_float_complex. void operator()(const rocsparse_float_complex complex_value) const { @@ -124,7 +124,7 @@ struct log_arg { os_ << separator_ << complex_value.x << separator_ << complex_value.y; } -*/ + */ private: std::ostream& os_; ///< Output stream. std::string& separator_; ///< Separator: output preceding argument. 
diff --git a/library/src/include/utility.h b/library/src/include/utility.h index 91e98c32..3977fb08 100644 --- a/library/src/include/utility.h +++ b/library/src/include/utility.h @@ -66,7 +66,7 @@ std::string replaceX(std::string input_string) { std::replace(input_string.begin(), input_string.end(), 'X', 'd'); } -/* + /* else if(std::is_same::value) { std::replace(input_string.begin(), input_string.end(), 'X', 'c'); @@ -79,7 +79,7 @@ std::string replaceX(std::string input_string) { std::replace(input_string.begin(), input_string.end(), 'X', 'h'); } -*/ + */ return input_string; } diff --git a/library/src/level1/rocsparse_axpyi.cpp b/library/src/level1/rocsparse_axpyi.cpp index 32cfef41..05789e2b 100644 --- a/library/src/level1/rocsparse_axpyi.cpp +++ b/library/src/level1/rocsparse_axpyi.cpp @@ -9,44 +9,41 @@ #include template -__device__ -void axpyi_device(rocsparse_int nnz, - T alpha, - const T *x_val, - const rocsparse_int *x_ind, - T *y, - rocsparse_index_base idx_base) +__device__ void axpyi_device(rocsparse_int nnz, + T alpha, + const T* x_val, + const rocsparse_int* x_ind, + T* y, + rocsparse_index_base idx_base) { int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - if (tid >= nnz) + if(tid >= nnz) { return; } - y[x_ind[tid]-idx_base] += alpha * x_val[tid]; + y[x_ind[tid] - idx_base] += alpha * x_val[tid]; } template -__global__ -void axpyi_kernel_host_scalar(rocsparse_int nnz, - T alpha, - const T *x_val, - const rocsparse_int *x_ind, - T *y, - rocsparse_index_base idx_base) +__global__ void axpyi_kernel_host_scalar(rocsparse_int nnz, + T alpha, + const T* x_val, + const rocsparse_int* x_ind, + T* y, + rocsparse_index_base idx_base) { axpyi_device(nnz, alpha, x_val, x_ind, y, idx_base); } template -__global__ -void axpyi_kernel_device_scalar(rocsparse_int nnz, - const T *alpha, - const T *x_val, - const rocsparse_int *x_ind, - T *y, - rocsparse_index_base idx_base) +__global__ void axpyi_kernel_device_scalar(rocsparse_int nnz, + const T* alpha, + 
const T* x_val, + const rocsparse_int* x_ind, + T* y, + rocsparse_index_base idx_base) { axpyi_device(nnz, *alpha, x_val, x_ind, y, idx_base); } @@ -77,73 +74,72 @@ void axpyi_kernel_device_scalar(rocsparse_int nnz, template rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, rocsparse_int nnz, - const T *alpha, - const T *x_val, - const rocsparse_int *x_ind, - T *y, + const T* alpha, + const T* x_val, + const rocsparse_int* x_ind, + T* y, rocsparse_index_base idx_base) { // Check for valid handle - if (handle == nullptr) + if(handle == nullptr) { return rocsparse_status_invalid_handle; } // Logging // TODO bench logging - if (handle->pointer_mode == rocsparse_pointer_mode_host) + if(handle->pointer_mode == rocsparse_pointer_mode_host) { log_trace(handle, replaceX("rocsparse_Xaxpyi"), nnz, *alpha, - (const void*&) x_val, - (const void*&) x_ind, - (const void*&) y); + (const void*&)x_val, + (const void*&)x_ind, + (const void*&)y); } else { log_trace(handle, replaceX("rocsparse_Xaxpyi"), nnz, - (const void*&) alpha, - (const void*&) x_val, - (const void*&) x_ind, - (const void*&) y); + (const void*&)alpha, + (const void*&)x_val, + (const void*&)x_ind, + (const void*&)y); } // Check index base - if (idx_base != rocsparse_index_base_zero && - idx_base != rocsparse_index_base_one) + if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) { return rocsparse_status_invalid_value; } // Check size - if (nnz < 0) + if(nnz < 0) { return rocsparse_status_invalid_size; } // Check pointer arguments - if (alpha == nullptr) + if(alpha == nullptr) { return rocsparse_status_invalid_pointer; } - else if (x_val == nullptr) + else if(x_val == nullptr) { return rocsparse_status_invalid_pointer; } - else if (x_ind == nullptr) + else if(x_ind == nullptr) { return rocsparse_status_invalid_pointer; } - else if (y == nullptr) + else if(y == nullptr) { return rocsparse_status_invalid_pointer; } // Quick return if possible - if (nnz == 0) + if(nnz == 0) 
{ return rocsparse_status_success; } @@ -152,25 +148,41 @@ rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, hipStream_t stream = handle->stream; #define AXPYI_DIM 256 - dim3 axpyi_blocks((nnz-1)/AXPYI_DIM+1); + dim3 axpyi_blocks((nnz - 1) / AXPYI_DIM + 1); dim3 axpyi_threads(AXPYI_DIM); - if (handle->pointer_mode == rocsparse_pointer_mode_device) + if(handle->pointer_mode == rocsparse_pointer_mode_device) { hipLaunchKernelGGL((axpyi_kernel_device_scalar), - axpyi_blocks, axpyi_threads, 0, stream, - nnz, alpha, x_val, x_ind, y, idx_base); + axpyi_blocks, + axpyi_threads, + 0, + stream, + nnz, + alpha, + x_val, + x_ind, + y, + idx_base); } else { - if (*alpha == 0.0) + if(*alpha == 0.0) { return rocsparse_status_success; } hipLaunchKernelGGL((axpyi_kernel_host_scalar), - axpyi_blocks, axpyi_threads, 0, stream, - nnz, *alpha, x_val, x_ind, y, idx_base); + axpyi_blocks, + axpyi_threads, + 0, + stream, + nnz, + *alpha, + x_val, + x_ind, + y, + idx_base); } #undef AXPYI_DIM return rocsparse_status_success; @@ -182,26 +194,24 @@ rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, * =========================================================================== */ -extern "C" -rocsparse_status rocsparse_saxpyi(rocsparse_handle handle, - rocsparse_int nnz, - const float *alpha, - const float *x_val, - const rocsparse_int *x_ind, - float *y, - rocsparse_index_base idx_base) +extern "C" rocsparse_status rocsparse_saxpyi(rocsparse_handle handle, + rocsparse_int nnz, + const float* alpha, + const float* x_val, + const rocsparse_int* x_ind, + float* y, + rocsparse_index_base idx_base) { return rocsparse_axpyi_template(handle, nnz, alpha, x_val, x_ind, y, idx_base); } -extern "C" -rocsparse_status rocsparse_daxpyi(rocsparse_handle handle, - rocsparse_int nnz, - const double *alpha, - const double *x_val, - const rocsparse_int *x_ind, - double *y, - rocsparse_index_base idx_base) +extern "C" rocsparse_status rocsparse_daxpyi(rocsparse_handle handle, + 
rocsparse_int nnz, + const double* alpha, + const double* x_val, + const rocsparse_int* x_ind, + double* y, + rocsparse_index_base idx_base) { return rocsparse_axpyi_template(handle, nnz, alpha, x_val, x_ind, y, idx_base); } diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index d1d5c579..d250fba6 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -28,8 +28,7 @@ // error value as a result of the 2sum calculation. // Returns: The non-corrected sum of inputs x and y. template -static __device__ -T two_sum(T x, T y, T *sumk_err) +static __device__ T two_sum(T x, T y, T* sumk_err) { const T sumk_s = x + y; #ifdef EXTENDED_PRECISION @@ -51,16 +50,17 @@ T two_sum(T x, T y, T *sumk_err) // and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher // Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, // 1974), respectively. - if (fabs(x) < fabs(y)) + if(fabs(x) < fabs(y)) { const T swap = x; + x = y; y = swap; } (*sumk_err) += (y - (sumk_s - x)); - // Original 6 FLOP 2Sum algorithm. - //T bp = sumk_s - x; - //(*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); +// Original 6 FLOP 2Sum algorithm. +// T bp = sumk_s - x; +// (*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); #endif return sumk_s; } @@ -71,22 +71,23 @@ T two_sum(T x, T y, T *sumk_err) // if we are in EXTENDED_PRECISION mode, this function devolves into two_sum // with x_vals and x_vec inputs multiplied separately from the compensated add. template -static __device__ -T two_fma(T x_vals, T x_vec, T y, T *sumk_err) +static __device__ T two_fma(T x_vals, T x_vec, T y, T* sumk_err) { #ifdef EXTENDED_PRECISION T x = x_vals * x_vec; + const T sumk_s = x + y; - if (fabs(x) < fabs(y)) + if(fabs(x) < fabs(y)) { const T swap = x; + x = y; y = swap; } (*sumk_err) += (y - (sumk_s - x)); // 2Sum in the FMA case. Poor performance on low-DPFP GPUs. 
- //const T bp = fma(-x_vals, x_vec, sumk_s); - //(*sumk_err) += (fma(x_vals, x_vec, -(sumk_s - bp)) + (y - bp)); + // const T bp = fma(-x_vals, x_vec, sumk_s); + // (*sumk_err) += (fma(x_vals, x_vec, -(sumk_s - bp)) + (y - bp)); return sumk_s; #else return fma(x_vals, x_vec, y); @@ -107,29 +108,26 @@ T two_fma(T x_vals, T x_vec, T y, T *sumk_err) // round: This parallel summation method operates in multiple rounds // to do a parallel reduction. See the blow comment for usage. template -static __device__ -T sum2_reduce(T cur_sum, T *err, - volatile T *partial, - int lid, - int thread_lane, - int round) +static __device__ T +sum2_reduce(T cur_sum, T* err, volatile T* partial, int lid, int thread_lane, int round) { - if (SUBWAVE_SIZE > round) + if(SUBWAVE_SIZE > round) { #ifdef EXTENDED_PRECISION const unsigned int partial_dest = lid + round; - if (thread_lane < round) - cur_sum = two_sum(cur_sum, partial[partial_dest], err); + if(thread_lane < round) + cur_sum = two_sum(cur_sum, partial[partial_dest], err); // We reuse the LDS entries to move the error values down into lower // threads. This saves LDS space, allowing higher occupancy, but requires // more barriers, which can reduce performance. __syncthreads(); // Have all of those upper threads pass their temporary errors // into a location that the lower threads can read. - if (thread_lane >= round) + if(thread_lane >= round) partial[lid] = *err; __syncthreads(); - if (thread_lane < round) { // Add those errors in. + if(thread_lane < round) + { // Add those errors in. *err += partial[partial_dest]; partial[lid] = cur_sum; } @@ -151,67 +149,68 @@ T sum2_reduce(T cur_sum, T *err, // WG_SIZE - workgroup ("block") size, 1D representation assumed // int - typename for the type of integer data read by the kernel, usually unsigned int // T - typename for the type of floating point data, usually double -// SUBWAVE_SIZE - the length of a "sub-wave", a power of 2, i.e. 
1,2,4,...,WAVE_SIZE, assigned to process a single matrix row +// SUBWAVE_SIZE - the length of a "sub-wave", a power of 2, i.e. 1,2,4,...,WAVE_SIZE, assigned to +// process a single matrix row template -static __device__ -//__attribute__((reqd_work_group_size(WG_SIZE,1,1))) -void csrmvn_general_device(int num_rows, - T alpha, - const int *row_offset, - const int *col, - const T *val, - const T *x, - T beta, - T *y) +static __device__ __launch_bounds__(WG_SIZE, 1) void csrmvn_general_device(int num_rows, + T alpha, + const int* row_offset, + const int* col, + const T* val, + const T* x, + T beta, + T* y) { - __shared__ volatile T sdata [WG_SIZE + SUBWAVE_SIZE / 2]; + __shared__ volatile T sdata[WG_SIZE + SUBWAVE_SIZE / 2]; - //const int vectors_per_block = WG_SIZE/SUBWAVE_SIZE; + // const int vectors_per_block = WG_SIZE/SUBWAVE_SIZE; const int global_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // global workitem id - const int local_id = hipThreadIdx_x; // local workitem id + const int local_id = hipThreadIdx_x; // local workitem id const int thread_lane = local_id & (SUBWAVE_SIZE - 1); const int vector_id = global_id / SUBWAVE_SIZE; // global vector id - //const int vector_lane = local_id / SUBWAVE_SIZE; // vector id within the workgroup + // const int vector_lane = local_id / SUBWAVE_SIZE; // vector id within the workgroup const int num_vectors = hipGridDim_x * hipBlockDim_x / SUBWAVE_SIZE; for(int row = vector_id; row < num_rows; row += num_vectors) { const int row_start = row_offset[row]; - const int row_end = row_offset[row+1]; - T sum = 0.; + const int row_end = row_offset[row + 1]; + T sum = 0.; T sumk_e = 0.; + // It is about 5% faster to always multiply by alpha, rather than to // check whether alpha is 0, 1, or other and do different code paths. 
for(int j = row_start + thread_lane; j < row_end; j += SUBWAVE_SIZE) sum = two_fma(alpha * val[j], x[col[j]], sum, &sumk_e); + T new_error = 0.; - sum = two_sum(sum, sumk_e, &new_error); + sum = two_sum(sum, sumk_e, &new_error); // Parallel reduction in shared memory. sdata[local_id] = sum; - // This compensated summation reduces cummulative rounding errors, - // which can become a problem on GPUs because our reduction order is - // different than what would be used on a CPU. - // It is based on the PSumK algorithm (with K==2) from - // Yamanaka, Ogita, Rump, and Oishi, "A Parallel Algorithm of - // Accurate Dot Product," in the Journal of Parallel Computing, - // 34(6-8), pp. 392-410, Jul. 2008. - #pragma unroll - for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) +// This compensated summation reduces cummulative rounding errors, +// which can become a problem on GPUs because our reduction order is +// different than what would be used on a CPU. +// It is based on the PSumK algorithm (with K==2) from +// Yamanaka, Ogita, Rump, and Oishi, "A Parallel Algorithm of +// Accurate Dot Product," in the Journal of Parallel Computing, +// 34(6-8), pp. 392-410, Jul. 2008. 
+#pragma unroll + for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) { __syncthreads(); sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, i); } - if (thread_lane == 0) + if(thread_lane == 0) { - if (beta == 0) + if(beta == 0) y[row] = sum + new_error; else { - sum = two_fma(beta, y[row], sum, &new_error); + sum = two_fma(beta, y[row], sum, &new_error); y[row] = sum + new_error; } } diff --git a/library/src/level2/ellmv_device.h b/library/src/level2/ellmv_device.h index 9c62f7bb..1a003edd 100644 --- a/library/src/level2/ellmv_device.h +++ b/library/src/level2/ellmv_device.h @@ -7,37 +7,36 @@ #include template -static __device__ -void ellmvn_device(rocsparse_int m, - rocsparse_int n, - rocsparse_int ell_width, - T alpha, - const rocsparse_int *ell_col_ind, - const T *ell_val, - const T *x, - T beta, - T *y) +static __device__ void ellmvn_device(rocsparse_int m, + rocsparse_int n, + rocsparse_int ell_width, + T alpha, + const rocsparse_int* ell_col_ind, + const T* ell_val, + const T* x, + T beta, + T* y) { rocsparse_int ai = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - if (ai >= m) + if(ai >= m) { return; } T sum = static_cast(0); - for (rocsparse_int p=0; p= 0 && col < n) + if(col >= 0 && col < n) { sum += ell_val[idx] * x[col]; } } - if (beta != static_cast(0)) + if(beta != static_cast(0)) { y[ai] = beta * y[ai] + alpha * sum; } diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index 859f36ed..1edc288d 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -10,30 +10,28 @@ #include template -__global__ -void csrmvn_kernel_host_pointer(rocsparse_int m, - T alpha, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - const T *csr_val, - const T *x, - T beta, - T *y) +__global__ void csrmvn_kernel_host_pointer(rocsparse_int m, + T alpha, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* csr_val, + const T* x, + T 
beta, + T* y) { csrmvn_general_device( m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); } template -__global__ -void csrmvn_kernel_device_pointer(rocsparse_int m, - const T *alpha, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - const T *csr_val, - const T *x, - const T *beta, - T *y) +__global__ void csrmvn_kernel_device_pointer(rocsparse_int m, + const T* alpha, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* csr_val, + const T* x, + const T* beta, + T* y) { csrmvn_general_device( m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); @@ -82,120 +80,123 @@ void csrmvn_kernel_device_pointer(rocsparse_int m, ********************************************************************/ template rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int n, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, rocsparse_int nnz, - const T *alpha, - const rocsparse_mat_descr descr, - const T *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - const T *x, - const T *beta, - T *y) + const T* alpha, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* x, + const T* beta, + T* y) { // Check for valid handle and matrix descriptor - if (handle == nullptr) + if(handle == nullptr) { return rocsparse_status_invalid_handle; } - else if (descr == nullptr) + else if(descr == nullptr) { return rocsparse_status_invalid_pointer; } // Logging TODO bench logging - if (handle->pointer_mode == rocsparse_pointer_mode_host) + if(handle->pointer_mode == rocsparse_pointer_mode_host) { log_trace(handle, replaceX("rocsparse_Xcsrmv"), trans, - m, n, nnz, + m, + n, + nnz, *alpha, - (const void*&) descr, - (const void*&) csr_val, - (const void*&) csr_row_ptr, - (const void*&) csr_col_ind, - (const void*&) x, + (const void*&)descr, + 
(const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)x, *beta, - (const void*&) y); + (const void*&)y); } else { log_trace(handle, replaceX("rocsparse_Xcsrmv"), trans, - m, n, nnz, - (const void*&) alpha, - (const void*&) descr, - (const void*&) csr_val, - (const void*&) csr_row_ptr, - (const void*&) csr_col_ind, - (const void*&) x, - (const void*&) beta, - (const void*&) y); + m, + n, + nnz, + (const void*&)alpha, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)x, + (const void*&)beta, + (const void*&)y); } // Check matrix type - if (descr->base != rocsparse_index_base_zero) + if(descr->base != rocsparse_index_base_zero) { // TODO return rocsparse_status_not_implemented; } - if (descr->type != rocsparse_matrix_type_general) + if(descr->type != rocsparse_matrix_type_general) { // TODO return rocsparse_status_not_implemented; } - // Check sizes - if (m < 0) + if(m < 0) { return rocsparse_status_invalid_size; } - else if (n < 0) + else if(n < 0) { return rocsparse_status_invalid_size; } - else if (nnz < 0) + else if(nnz < 0) { return rocsparse_status_invalid_size; } // Check pointer arguments - if (csr_val == nullptr) + if(csr_val == nullptr) { return rocsparse_status_invalid_pointer; } - else if (csr_row_ptr == nullptr) + else if(csr_row_ptr == nullptr) { return rocsparse_status_invalid_pointer; } - else if (csr_col_ind == nullptr) + else if(csr_col_ind == nullptr) { return rocsparse_status_invalid_pointer; } - else if (x == nullptr) + else if(x == nullptr) { return rocsparse_status_invalid_pointer; } - else if (y == nullptr) + else if(y == nullptr) { return rocsparse_status_invalid_pointer; } - else if (alpha == nullptr) + else if(alpha == nullptr) { return rocsparse_status_invalid_pointer; } - else if (beta == nullptr) + else if(beta == nullptr) { return rocsparse_status_invalid_pointer; } // Quick return if possible - if (m == 0 || n == 0 || nnz 
== 0) + if(m == 0 || n == 0 || nnz == 0) { return rocsparse_status_success; } @@ -204,86 +205,196 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, hipStream_t stream = handle->stream; // Run different csrmv kernels - if (trans == rocsparse_operation_none) + if(trans == rocsparse_operation_none) { #define CSRMVN_DIM 512 rocsparse_int nnz_per_row = nnz / m; - dim3 csrmvn_blocks((m-1)/CSRMVN_DIM+1); + dim3 csrmvn_blocks((m - 1) / CSRMVN_DIM + 1); dim3 csrmvn_threads(CSRMVN_DIM); - if (handle->pointer_mode == rocsparse_pointer_mode_device) + if(handle->pointer_mode == rocsparse_pointer_mode_device) { - if (handle->warp_size == 32) + if(handle->warp_size == 32) { - if (nnz_per_row < 4) + if(nnz_per_row < 4) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y); } - else if (nnz_per_row < 8) + else if(nnz_per_row < 8) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y); } - else if (nnz_per_row < 16) + else if(nnz_per_row < 16) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y); } - else if (nnz_per_row < 32) + else if(nnz_per_row < 32) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + 
csr_col_ind, + csr_val, + x, + beta, + y); } else { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y); } } - else if (handle->warp_size == 64) + else if(handle->warp_size == 64) { - if (nnz_per_row < 4) + if(nnz_per_row < 4) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y); } - else if (nnz_per_row < 8) + else if(nnz_per_row < 8) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y); } - else if (nnz_per_row < 16) + else if(nnz_per_row < 16) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y); } - else if (nnz_per_row < 32) + else if(nnz_per_row < 32) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y); } - else if (nnz_per_row < 64) + else if(nnz_per_row < 64) { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); + csrmvn_blocks, + 
csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y); } else { hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y); } } else @@ -293,81 +404,191 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else { - if (*alpha == 0.0 && *beta == 1.0) + if(*alpha == 0.0 && *beta == 1.0) { return rocsparse_status_success; } - if (handle->warp_size == 32) + if(handle->warp_size == 32) { - if (nnz_per_row < 4) + if(nnz_per_row < 4) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y); } - else if (nnz_per_row < 8) + else if(nnz_per_row < 8) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y); } - else if (nnz_per_row < 16) + else if(nnz_per_row < 16) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y); } - else if (nnz_per_row < 32) + else if(nnz_per_row < 32) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + 
csr_val, + x, + *beta, + y); } else { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y); } } - else if (handle->warp_size == 64) + else if(handle->warp_size == 64) { - if (nnz_per_row < 4) + if(nnz_per_row < 4) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y); } - else if (nnz_per_row < 8) + else if(nnz_per_row < 8) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y); } - else if (nnz_per_row < 16) + else if(nnz_per_row < 16) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y); } - else if (nnz_per_row < 32) + else if(nnz_per_row < 32) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y); } - else if (nnz_per_row < 64) + else if(nnz_per_row < 64) { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); + csrmvn_blocks, + csrmvn_threads, 
+ 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y); } else { hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, csrmvn_threads, 0, stream, - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y); } } else @@ -391,42 +612,38 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, * =========================================================================== */ -extern "C" -rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int n, - rocsparse_int nnz, - const float *alpha, - const rocsparse_mat_descr descr, - const float *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - const float *x, - const float *beta, - float *y) +extern "C" rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const float* alpha, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const float* x, + const float* beta, + float* y) { return rocsparse_csrmv_template( - handle, trans, m, n, nnz, alpha, descr, - csr_val, csr_row_ptr, csr_col_ind, x, beta, y); + handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } - -extern "C" -rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int n, - rocsparse_int nnz, - const double *alpha, - const rocsparse_mat_descr descr, - const double *csr_val, - const rocsparse_int *csr_row_ptr, - const rocsparse_int *csr_col_ind, - const double *x, - const double *beta, - double *y) + +extern "C" rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + 
rocsparse_int n, + rocsparse_int nnz, + const double* alpha, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const double* x, + const double* beta, + double* y) { return rocsparse_csrmv_template( - handle, trans, m, n, nnz, alpha, descr, - csr_val, csr_row_ptr, csr_col_ind, x, beta, y); + handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } diff --git a/library/src/level2/rocsparse_hybmv.cpp b/library/src/level2/rocsparse_hybmv.cpp index c6035876..93e3bca5 100644 --- a/library/src/level2/rocsparse_hybmv.cpp +++ b/library/src/level2/rocsparse_hybmv.cpp @@ -10,31 +10,29 @@ #include template -__global__ -void ellmvn_kernel_host_pointer(rocsparse_int m, - rocsparse_int n, - rocsparse_int ell_width, - T alpha, - const rocsparse_int *ell_col_ind, - const T *ell_val, - const T *x, - T beta, - T *y) +__global__ void ellmvn_kernel_host_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int ell_width, + T alpha, + const rocsparse_int* ell_col_ind, + const T* ell_val, + const T* x, + T beta, + T* y) { ellmvn_device(m, n, ell_width, alpha, ell_col_ind, ell_val, x, beta, y); } template -__global__ -void ellmvn_kernel_device_pointer(rocsparse_int m, - rocsparse_int n, - rocsparse_int ell_width, - const T *alpha, - const rocsparse_int *ell_col_ind, - const T *ell_val, - const T *x, - const T *beta, - T *y) +__global__ void ellmvn_kernel_device_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int ell_width, + const T* alpha, + const rocsparse_int* ell_col_ind, + const T* ell_val, + const T* x, + const T* beta, + T* y) { ellmvn_device(m, n, ell_width, *alpha, ell_col_ind, ell_val, x, *beta, y); } @@ -42,138 +40,137 @@ void ellmvn_kernel_device_pointer(rocsparse_int m, template rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, rocsparse_operation trans, - const T *alpha, + const T* alpha, const rocsparse_mat_descr descr, const 
rocsparse_hyb_mat hyb, - const T *x, - const T *beta, - T *y) + const T* x, + const T* beta, + T* y) { // Check for valid handle and matrix descriptor - if (handle == nullptr) + if(handle == nullptr) { return rocsparse_status_invalid_handle; } - else if (descr == nullptr) + else if(descr == nullptr) { return rocsparse_status_invalid_pointer; } - else if (hyb == nullptr) + else if(hyb == nullptr) { return rocsparse_status_invalid_pointer; } // Logging TODO bench logging - if (handle->pointer_mode == rocsparse_pointer_mode_host) + if(handle->pointer_mode == rocsparse_pointer_mode_host) { log_trace(handle, replaceX("rocsparse_Xhybmv"), trans, *alpha, - (const void*&) descr, - (const void*&) hyb, - (const void*&) x, + (const void*&)descr, + (const void*&)hyb, + (const void*&)x, *beta, - (const void*&) y); + (const void*&)y); } else { log_trace(handle, replaceX("rocsparse_Xhybmv"), trans, - (const void*&) alpha, - (const void*&) descr, - (const void*&) hyb, - (const void*&) x, - (const void*&) beta, - (const void*&) y); + (const void*&)alpha, + (const void*&)descr, + (const void*&)hyb, + (const void*&)x, + (const void*&)beta, + (const void*&)y); } // Check matrix type - if (descr->base != rocsparse_index_base_zero) + if(descr->base != rocsparse_index_base_zero) { // TODO return rocsparse_status_not_implemented; } - if (descr->type != rocsparse_matrix_type_general) + if(descr->type != rocsparse_matrix_type_general) { // TODO return rocsparse_status_not_implemented; } - if (hyb->partition != rocsparse_hyb_partition_max) + if(hyb->partition != rocsparse_hyb_partition_max) { return rocsparse_status_not_implemented; } // Check sizes - if (hyb->m < 0) + if(hyb->m < 0) { return rocsparse_status_invalid_size; } - else if (hyb->n < 0) + else if(hyb->n < 0) { return rocsparse_status_invalid_size; } - else if (hyb->ell_nnz + hyb->coo_nnz < 0) + else if(hyb->ell_nnz + hyb->coo_nnz < 0) { return rocsparse_status_invalid_size; } // Check ELL-HYB structure - if (hyb->ell_nnz > 0) + 
if(hyb->ell_nnz > 0) { - if (hyb->ell_width < 0) + if(hyb->ell_width < 0) { return rocsparse_status_invalid_size; } - else if (hyb->ell_col_ind == nullptr) + else if(hyb->ell_col_ind == nullptr) { return rocsparse_status_invalid_pointer; } - else if (hyb->ell_val == nullptr) + else if(hyb->ell_val == nullptr) { return rocsparse_status_invalid_pointer; } } // Check COO-HYB structure - if (hyb->coo_nnz > 0) + if(hyb->coo_nnz > 0) { - if (hyb->coo_row_ind == nullptr) + if(hyb->coo_row_ind == nullptr) { return rocsparse_status_invalid_pointer; } - else if (hyb->coo_col_ind == nullptr) + else if(hyb->coo_col_ind == nullptr) { return rocsparse_status_invalid_pointer; } - else if (hyb->coo_val == nullptr) + else if(hyb->coo_val == nullptr) { return rocsparse_status_invalid_pointer; } } // Check pointer arguments - if (x == nullptr) + if(x == nullptr) { return rocsparse_status_invalid_pointer; } - else if (y == nullptr) + else if(y == nullptr) { return rocsparse_status_invalid_pointer; } - else if (alpha == nullptr) + else if(alpha == nullptr) { return rocsparse_status_invalid_pointer; } - else if (beta == nullptr) + else if(beta == nullptr) { return rocsparse_status_invalid_pointer; } // Quick return if possible - if (hyb->m == 0 || hyb->n == 0 || - hyb->ell_nnz + hyb->coo_nnz == 0) + if(hyb->m == 0 || hyb->n == 0 || hyb->ell_nnz + hyb->coo_nnz == 0) { return rocsparse_status_success; } @@ -182,32 +179,40 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, hipStream_t stream = handle->stream; // Run different hybmv kernels - if (trans == rocsparse_operation_none) + if(trans == rocsparse_operation_none) { #define ELLMVN_DIM 512 - dim3 ellmvn_blocks((hyb->m-1)/ELLMVN_DIM+1); + dim3 ellmvn_blocks((hyb->m - 1) / ELLMVN_DIM + 1); dim3 ellmvn_threads(ELLMVN_DIM); - if (handle->pointer_mode == rocsparse_pointer_mode_device) + if(handle->pointer_mode == rocsparse_pointer_mode_device) { } else { - if (*alpha == 0.0 && *beta == 1.0) + if(*alpha == 0.0 && *beta == 
1.0) { return rocsparse_status_success; } // ELL part - if (hyb->ell_nnz > 0) + if(hyb->ell_nnz > 0) { hipLaunchKernelGGL((ellmvn_kernel_host_pointer), - ellmvn_blocks, ellmvn_threads, 0, stream, - hyb->m, hyb->n, hyb->ell_width, *alpha, - hyb->ell_col_ind, (T*) hyb->ell_val, - x, *beta, y); + ellmvn_blocks, + ellmvn_threads, + 0, + stream, + hyb->m, + hyb->n, + hyb->ell_width, + *alpha, + hyb->ell_col_ind, + (T*)hyb->ell_val, + x, + *beta, + y); } - } #undef ELLMVN_DIM } @@ -225,30 +230,26 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, * =========================================================================== */ -extern "C" -rocsparse_status rocsparse_shybmv(rocsparse_handle handle, - rocsparse_operation trans, - const float *alpha, - const rocsparse_mat_descr descr, - const rocsparse_hyb_mat hyb, - const float *x, - const float *beta, - float *y) +extern "C" rocsparse_status rocsparse_shybmv(rocsparse_handle handle, + rocsparse_operation trans, + const float* alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, + const float* x, + const float* beta, + float* y) { - return rocsparse_hybmv_template(handle, trans, alpha, - descr, hyb, x, beta, y); + return rocsparse_hybmv_template(handle, trans, alpha, descr, hyb, x, beta, y); } -extern "C" -rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, - rocsparse_operation trans, - const double *alpha, - const rocsparse_mat_descr descr, - const rocsparse_hyb_mat hyb, - const double *x, - const double *beta, - double *y) +extern "C" rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, + rocsparse_operation trans, + const double* alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, + const double* x, + const double* beta, + double* y) { - return rocsparse_hybmv_template(handle, trans, alpha, - descr, hyb, x, beta, y); + return rocsparse_hybmv_template(handle, trans, alpha, descr, hyb, x, beta, y); } diff --git a/library/src/rocsparse_auxiliary.cpp 
b/library/src/rocsparse_auxiliary.cpp index 3de0b09a..27f90b15 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -19,10 +19,10 @@ extern "C" { * to all subsequent library function calls. * It should be destroyed at the end using rocsparse_destroy_handle(). *******************************************************************************/ -rocsparse_status rocsparse_create_handle(rocsparse_handle *handle) +rocsparse_status rocsparse_create_handle(rocsparse_handle* handle) { // Check if handle is valid - if (handle == nullptr) + if(handle == nullptr) { return rocsparse_status_invalid_handle; } @@ -34,7 +34,7 @@ rocsparse_status rocsparse_create_handle(rocsparse_handle *handle) *handle = new _rocsparse_handle(); log_trace(*handle, "rocsparse_create_handle"); } - catch (const rocsparse_status &status) + catch(const rocsparse_status& status) { return status; } @@ -53,7 +53,7 @@ rocsparse_status rocsparse_destroy_handle(rocsparse_handle handle) { delete handle; } - catch (const rocsparse_status &status) + catch(const rocsparse_status& status) { return status; } @@ -64,11 +64,10 @@ rocsparse_status rocsparse_destroy_handle(rocsparse_handle handle) * \brief Indicates whether the scalar value pointers are on the host or device. * Set pointer mode, can be host or device *******************************************************************************/ -rocsparse_status rocsparse_set_pointer_mode(rocsparse_handle handle, - rocsparse_pointer_mode mode) +rocsparse_status rocsparse_set_pointer_mode(rocsparse_handle handle, rocsparse_pointer_mode mode) { // Check if handle is valid - if (handle == nullptr) + if(handle == nullptr) { return rocsparse_status_invalid_handle; } @@ -80,11 +79,10 @@ rocsparse_status rocsparse_set_pointer_mode(rocsparse_handle handle, /******************************************************************************** * \brief Get pointer mode, can be host or device. 
*******************************************************************************/ -rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, - rocsparse_pointer_mode *mode) +rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, rocsparse_pointer_mode* mode) { // Check if handle is valid - if (handle == nullptr) + if(handle == nullptr) { return rocsparse_status_invalid_handle; } @@ -100,7 +98,7 @@ rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, rocsparse_status rocsparse_set_stream(rocsparse_handle handle, hipStream_t stream_id) { // Check if handle is valid - if (handle == nullptr) + if(handle == nullptr) { return rocsparse_status_invalid_handle; } @@ -111,10 +109,10 @@ rocsparse_status rocsparse_set_stream(rocsparse_handle handle, hipStream_t strea /******************************************************************************** *! \brief Get rocsparse stream used for all subsequent library function calls. *******************************************************************************/ -rocsparse_status rocsparse_get_stream(rocsparse_handle handle, hipStream_t *stream_id) +rocsparse_status rocsparse_get_stream(rocsparse_handle handle, hipStream_t* stream_id) { // Check if handle is valid - if (handle == nullptr) + if(handle == nullptr) { return rocsparse_status_invalid_handle; } @@ -128,16 +126,15 @@ rocsparse_status rocsparse_get_stream(rocsparse_handle handle, hipStream_t *stre * version / 100 % 1000 = minor version * version / 100000 = major version *******************************************************************************/ -rocsparse_status rocsparse_get_version(rocsparse_handle handle, int *version) +rocsparse_status rocsparse_get_version(rocsparse_handle handle, rocsparse_int* version) { // Check if handle is valid - if (handle == nullptr) + if(handle == nullptr) { return rocsparse_status_invalid_handle; } - *version = ROCSPARSE_VERSION_MAJOR * 100000 - + ROCSPARSE_VERSION_MINOR * 100 - + 
ROCSPARSE_VERSION_PATCH; + *version = + ROCSPARSE_VERSION_MAJOR * 100000 + ROCSPARSE_VERSION_MINOR * 100 + ROCSPARSE_VERSION_PATCH; log_trace(handle, "rocsparse_get_version", *version); return rocsparse_status_success; } @@ -149,9 +146,9 @@ rocsparse_status rocsparse_get_version(rocsparse_handle handle, int *version) * calls that involve the matrix. * It should be destroyed at the end using rocsparse_destroy_mat_descr(). *******************************************************************************/ -rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descr) +rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr* descr) { - if (descr == nullptr) + if(descr == nullptr) { return rocsparse_status_invalid_pointer; } @@ -162,7 +159,7 @@ rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr *descr) { *descr = new _rocsparse_mat_descr; } - catch (const rocsparse_status &status) + catch(const rocsparse_status& status) { return status; } @@ -180,7 +177,7 @@ rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descr) { delete descr; } - catch (const rocsparse_status &status) + catch(const rocsparse_status& status) { return status; } @@ -190,15 +187,14 @@ rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descr) /******************************************************************************** * \brief Set the index base of the matrix descriptor. 
*******************************************************************************/ -rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descr, - rocsparse_index_base base) +rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descr, rocsparse_index_base base) { // Check if descriptor is valid - if (descr == nullptr) + if(descr == nullptr) { return rocsparse_status_invalid_pointer; } - if (base != rocsparse_index_base_zero && base != rocsparse_index_base_one) + if(base != rocsparse_index_base_zero && base != rocsparse_index_base_one) { return rocsparse_status_invalid_value; } @@ -212,7 +208,7 @@ rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descr, rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr descr) { // If descriptor is invalid, default index base is returned - if (descr == nullptr) + if(descr == nullptr) { return rocsparse_index_base_zero; } @@ -222,17 +218,15 @@ rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr desc /******************************************************************************** * \brief Set the matrix type of the matrix descriptor. 
*******************************************************************************/ -rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descr, - rocsparse_matrix_type type) +rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descr, rocsparse_matrix_type type) { // Check if descriptor is valid - if (descr == nullptr) + if(descr == nullptr) { return rocsparse_status_invalid_pointer; } - if (type != rocsparse_matrix_type_general && - type != rocsparse_matrix_type_symmetric && - type != rocsparse_matrix_type_hermitian) + if(type != rocsparse_matrix_type_general && type != rocsparse_matrix_type_symmetric && + type != rocsparse_matrix_type_hermitian) { return rocsparse_status_invalid_value; } @@ -246,7 +240,7 @@ rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descr, rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr) { // If descriptor is invalid, default matrix type is returned - if (descr == nullptr) + if(descr == nullptr) { return rocsparse_matrix_type_general; } @@ -260,9 +254,9 @@ rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr) * calls that involve the HYB matrix. * It should be destroyed at the end using rocsparse_destroy_hyb_mat(). 
*******************************************************************************/ -rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat *hyb) +rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat* hyb) { - if (hyb == nullptr) + if(hyb == nullptr) { return rocsparse_status_invalid_pointer; } @@ -273,7 +267,7 @@ rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat *hyb) { *hyb = new _rocsparse_hyb_mat; } - catch (const rocsparse_status &status) + catch(const rocsparse_status& status) { return status; } @@ -291,7 +285,7 @@ rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb) { delete hyb; } - catch (const rocsparse_status &status) + catch(const rocsparse_status& status) { return status; } diff --git a/library/src/status.cpp b/library/src/status.cpp index 5b447f4f..f377e541 100644 --- a/library/src/status.cpp +++ b/library/src/status.cpp @@ -15,33 +15,33 @@ ******************************************************************************/ rocsparse_status get_rocsparse_status_for_hip_status(hipError_t status) { - switch (status) + switch(status) { - // success - case hipSuccess: - return rocsparse_status_success; - - // internal hip memory allocation - case hipErrorMemoryAllocation: - case hipErrorLaunchOutOfResources: - return rocsparse_status_memory_error; - - // user-allocated hip memory - case hipErrorInvalidDevicePointer: // hip memory - return rocsparse_status_invalid_pointer; - - // user-allocated device, stream, event - case hipErrorInvalidDevice: - case hipErrorInvalidResourceHandle: - return rocsparse_status_invalid_handle; - - // library using hip incorrectly - case hipErrorInvalidValue: - return rocsparse_status_internal_error; - - // hip runtime failing - case hipErrorNoDevice: // no hip devices - case hipErrorUnknown: - default: return rocsparse_status_internal_error; + // success + case hipSuccess: + return rocsparse_status_success; + + // internal hip memory allocation + case hipErrorMemoryAllocation: + case 
hipErrorLaunchOutOfResources: + return rocsparse_status_memory_error; + + // user-allocated hip memory + case hipErrorInvalidDevicePointer: // hip memory + return rocsparse_status_invalid_pointer; + + // user-allocated device, stream, event + case hipErrorInvalidDevice: + case hipErrorInvalidResourceHandle: + return rocsparse_status_invalid_handle; + + // library using hip incorrectly + case hipErrorInvalidValue: + return rocsparse_status_internal_error; + + // hip runtime failing + case hipErrorNoDevice: // no hip devices + case hipErrorUnknown: + default: return rocsparse_status_internal_error; } } From 109a792f1419c7b5acfe1fb7d30c3038f2f9a924 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 15 May 2018 20:05:52 +0200 Subject: [PATCH 066/304] launchbounds fix in csrmv --- library/src/level2/csrmv_device.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index d250fba6..faf51a3e 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -152,14 +152,14 @@ sum2_reduce(T cur_sum, T* err, volatile T* partial, int lid, int thread_lane, in // SUBWAVE_SIZE - the length of a "sub-wave", a power of 2, i.e. 
1,2,4,...,WAVE_SIZE, assigned to // process a single matrix row template -static __device__ __launch_bounds__(WG_SIZE, 1) void csrmvn_general_device(int num_rows, - T alpha, - const int* row_offset, - const int* col, - const T* val, - const T* x, - T beta, - T* y) +static __device__ void csrmvn_general_device(int num_rows, + T alpha, + const int* row_offset, + const int* col, + const T* val, + const T* x, + T beta, + T* y) { __shared__ volatile T sdata[WG_SIZE + SUBWAVE_SIZE / 2]; From 6f6d6782e4099691dc68c88e750f6c9d3ec926cf Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 18 May 2018 08:48:53 +0200 Subject: [PATCH 067/304] samples: nvcc compiler fix --- clients/samples/CMakeLists.txt | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/clients/samples/CMakeLists.txt b/clients/samples/CMakeLists.txt index b2238fee..9cdbb3e9 100644 --- a/clients/samples/CMakeLists.txt +++ b/clients/samples/CMakeLists.txt @@ -16,17 +16,26 @@ function(add_rocsparse_example EXAMPLE_SOURCE) $ ) - target_link_libraries(${EXAMPLE_TARGET} - PRIVATE - rocsparse - hip::hip_hcc - ) - foreach(amdgpu_target ${AMDGPU_TARGETS}) + if(HIP_PLATFORM STREQUAL "hcc") + target_link_libraries(${EXAMPLE_TARGET} + PRIVATE + rocsparse + hip::hip_hcc + ) + foreach(amdgpu_target ${AMDGPU_TARGETS}) + target_link_libraries(${EXAMPLE_TARGET} + PRIVATE + --amdgpu-target=${amdgpu_target} + ) + endforeach() + endif() + + if(HIP_PLATFORM STREQUAL "nvcc") target_link_libraries(${EXAMPLE_TARGET} PRIVATE - --amdgpu-target=${amdgpu_target} + rocsparse ) - endforeach() + endif() set_target_properties(${EXAMPLE_TARGET} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/example" From 122ca6ee81bc03513f6d8525666e8d0972830d06 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 18 May 2018 08:49:07 +0200 Subject: [PATCH 068/304] jenkins: removed clang check --- Jenkinsfile | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git 
a/Jenkinsfile b/Jenkinsfile index 48422b1b..3ffc7a82 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -208,19 +208,19 @@ def docker_build_inside_image( def build_image, compiler_data compiler_args, doc """ archiveArtifacts artifacts: "${docker_context}/*.deb", fingerprint: true - stage('Clang Format') - { - sh ''' - find . -iname \'*.h\' \ - -o -iname \'*.hpp\' \ - -o -iname \'*.cpp\' \ - -o -iname \'*.h.in\' \ - -o -iname \'*.hpp.in\' \ - -o -iname \'*.cpp.in\' \ - | grep -v 'build/' \ - | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-3.8 -style=file {} | diff - {}\' - ''' - } +// stage('Clang Format') +// { +// sh ''' +// find . -iname \'*.h\' \ +// -o -iname \'*.hpp\' \ +// -o -iname \'*.cpp\' \ +// -o -iname \'*.h.in\' \ +// -o -iname \'*.hpp.in\' \ +// -o -iname \'*.cpp.in\' \ +// | grep -v 'build/' \ +// | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-3.8 -style=file {} | diff - {}\' +// ''' +// } } else if( paths.project_name.equalsIgnoreCase( 'rocsparse-fedora' ) ) { From 3a961f381a8bd60809142a1e8ce8b0053fecb2ef Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 23 May 2018 19:16:23 +0200 Subject: [PATCH 069/304] coomv for non-transposed general matrix supporting only beta==0 || beta==1 --- library/src/CMakeLists.txt | 1 + library/src/level2/coomv_device.h | 197 ++++++++++++++ library/src/level2/rocsparse_coomv.cpp | 362 +++++++++++++++++++++++++ 3 files changed, 560 insertions(+) create mode 100644 library/src/level2/coomv_device.h create mode 100644 library/src/level2/rocsparse_coomv.cpp diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index f780787a..cf00dcca 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -8,6 +8,7 @@ set(rocsparse_source src/status.cpp src/rocsparse_auxiliary.cpp src/level1/rocsparse_axpyi.cpp + src/level2/rocsparse_coomv.cpp src/level2/rocsparse_csrmv.cpp src/level2/rocsparse_hybmv.cpp src/conversion/rocsparse_csr2hyb.cpp diff --git a/library/src/level2/coomv_device.h 
b/library/src/level2/coomv_device.h new file mode 100644 index 00000000..9e05e0f1 --- /dev/null +++ b/library/src/level2/coomv_device.h @@ -0,0 +1,197 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef COOMV_DEVICE_H +#define COOMV_DEVICE_H + +#include + +// Implementation motivated by papers 'Efficient Sparse Matrix-Vector Multiplication on CUDA', +// 'Implementing Sparse Matrix-Vector Multiplication on Throughput-Oriented Processors' and +// 'Segmented operations for sparse matrix computation on vector multiprocessors' +template +static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, + rocsparse_int loops, + T alpha, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const T* coo_val, + const T* x, + T* y, + rocsparse_int* row_block_red, + T* val_block_red) +{ + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + rocsparse_int tid = hipThreadIdx_x; + + // Lane index (0,...,WARPSIZE) + rocsparse_int laneid = gid % WARPSIZE; + // Warp index + rocsparse_int warpid = gid / WARPSIZE; + + // Global COO array index start for current warp + rocsparse_int offset = warpid * loops * WARPSIZE; + + // Quick return when thread is out of bounds + if(offset + laneid >= nnz) + { + return; + } + + rocsparse_int row; + T val; + + // Shared memory to hold row indices and values for segmented reduction + __shared__ rocsparse_int shared_row[BLOCKSIZE]; + __shared__ T shared_val[BLOCKSIZE]; + + // Current threads index into COO structure + rocsparse_int idx = offset + laneid; + + // Each thread processes 'loop' COO entries + while(idx < offset + loops * WARPSIZE) + { + // Get corresponding COO entry, if not out of bounds. 
+ // This can happen when processing more than 1 entry if + // nnz % WARPSIZE != 0 + if(idx < nnz) + { + row = coo_row_ind[idx]; + val = alpha * coo_val[idx] * x[coo_col_ind[idx]]; + } + else + { + row = -1; + val = static_cast(0); + } + + // First thread in warp checks row index from previous loop + // if it has been completed or if additional rows have to be + // appended. + if(idx > offset && laneid == 0) + { + rocsparse_int prevrow = shared_row[tid + WARPSIZE - 1]; + if(row == prevrow) + { + val += shared_val[tid + WARPSIZE - 1]; + } + else if(prevrow >= 0) + { + y[prevrow] += shared_val[tid + WARPSIZE - 1]; + } + } + + __syncthreads(); + + // Update shared buffers + shared_row[tid] = row; + shared_val[tid] = val; + + __syncthreads(); + +#pragma unroll + // Segmented warp reduction + for(rocsparse_int j = 1; j < WARPSIZE; j <<= 1) + { + if(laneid >= j) + { + if(row == shared_row[tid - j]) + { + val += shared_val[tid - j]; + } + } + __syncthreads(); + + shared_val[tid] = val; + + __syncthreads(); + } + + // All lanes but the last one write their result in y. + // The last value might need to be appended by the next iteration. 
+ if(laneid < WARPSIZE - 1) + { + if(row != shared_row[tid + 1] && row >= 0) + { + y[row] += val; + } + } + + // Keep going for the next iteration + idx += WARPSIZE; + } + + // Write last entries into buffers for segmented block reduction + if(laneid == WARPSIZE - 1) + { + row_block_red[warpid] = row; + val_block_red[warpid] = val; + } +} + +template +static __device__ void segmented_blockreduce(const rocsparse_int* rows, T* vals) +{ + rocsparse_int tid = hipThreadIdx_x; +#pragma unroll + for(rocsparse_int j = 1; j < BLOCKSIZE; j <<= 1) + { + T val = static_cast(0); + if(tid >= j) + { + if(rows[tid] == rows[tid - j]) + { + val = vals[tid - j]; + } + } + __syncthreads(); + + vals[tid] += val; + __syncthreads(); + } +} + +template +static __device__ void coomvn_general_block_reduce(rocsparse_int nnz, + const rocsparse_int* row_block_red, + const T* val_block_red, + T* y) +{ + rocsparse_int tid = hipThreadIdx_x; + + // Quick return when thread is out of bounds + if(tid >= nnz) + { + return; + } + + // Shared memory to hold row indices and values for segmented reduction + __shared__ rocsparse_int shared_row[BLOCKSIZE]; + __shared__ T shared_val[BLOCKSIZE]; + + // Loop over blocks that are subject for segmented reduction + for(rocsparse_int i = tid; i < nnz; i += BLOCKSIZE) + { + // Copy data to reduction buffers + shared_row[tid] = row_block_red[i]; + shared_val[tid] = val_block_red[i]; + + __syncthreads(); + + // Do segmented block reduction + segmented_blockreduce(shared_row, shared_val); + + // Add reduced sum to y if valid + rocsparse_int row = shared_row[tid]; + if(row != shared_row[tid + 1] && row >= 0) + { + y[row] += shared_val[tid]; + } + + __syncthreads(); + } +} + +#endif // COOMV_DEVICE_H diff --git a/library/src/level2/rocsparse_coomv.cpp b/library/src/level2/rocsparse_coomv.cpp new file mode 100644 index 00000000..d764a9a9 --- /dev/null +++ b/library/src/level2/rocsparse_coomv.cpp @@ -0,0 +1,362 @@ +/* 
************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "rocsparse.h" +#include "definitions.h" +#include "handle.h" +#include "utility.h" +#include "coomv_device.h" + +#include + +template +__global__ void coomvn_warp_host_pointer(rocsparse_int nnz, + rocsparse_int loops, + T alpha, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const T* coo_val, + const T* x, + T* y, + rocsparse_int* row_block_red, + T* val_block_red) +{ + coomvn_general_warp_reduce( + nnz, loops, alpha, coo_row_ind, coo_col_ind, coo_val, x, y, row_block_red, val_block_red); +} + +template +__global__ void coomvn_warp_device_pointer(rocsparse_int nnz, + rocsparse_int loops, + T* alpha, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const T* coo_val, + const T* x, + T* y, + rocsparse_int* row_block_red, + T* val_block_red) +{ + coomvn_general_warp_reduce( + nnz, loops, *alpha, coo_row_ind, coo_col_ind, coo_val, x, y, row_block_red, val_block_red); +} + +template +__global__ void coomvn_block_reduce(rocsparse_int nnz, + const rocsparse_int* row_block_red, + const T* val_block_red, + T* y) +{ + coomvn_general_block_reduce(nnz, row_block_red, val_block_red, y); +} + +/*! \brief SPARSE Level 2 API + + \details + coomv multiplies the dense vector x[i] with scalar alpha and sparse m x n + matrix A that is defined in COO storage format and add the result to y[i] + that is multiplied by beta, for i = 1 , … , n + + y := alpha * op(A) * x + beta * y, + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + trans operation type of A. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero entries of A. + @param[in] + alpha scalar alpha. + @param[in] + descr descriptor of A. 
+ @param[in] + coo_val array of nnz elements of A. + @param[in] + coo_row_ind array of nnz elements containing the row indices of A. + @param[in] + coo_col_ind array of nnz elements containing the column indices of A. + @param[in] + x array of n elements (op(A) = A) or m elements (op(A) = A^T or + op(A) = A^H). + @param[in] + beta scalar beta. + @param[inout] + y array of m elements (op(A) = A) or n elements (op(A) = A^T or + op(A) = A^H). + + ********************************************************************/ +template +rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const T* alpha, + const rocsparse_mat_descr descr, + const T* coo_val, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const T* x, + const T* beta, + T* y) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + if(handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_Xcoomv"), + trans, + m, + n, + nnz, + *alpha, + (const void*&)descr, + (const void*&)coo_val, + (const void*&)coo_row_ind, + (const void*&)coo_col_ind, + (const void*&)x, + *beta, + (const void*&)y); + } + else + { + log_trace(handle, + replaceX("rocsparse_Xcoomv"), + trans, + m, + n, + nnz, + (const void*&)alpha, + (const void*&)descr, + (const void*&)coo_val, + (const void*&)coo_row_ind, + (const void*&)coo_col_ind, + (const void*&)x, + (const void*&)beta, + (const void*&)y); + } + + // Check matrix type + if(descr->base != rocsparse_index_base_zero) + { + // TODO + return rocsparse_status_not_implemented; + } + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0) + { + return 
rocsparse_status_invalid_size; + } + else if(n < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(coo_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(coo_row_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(coo_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(alpha == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(beta == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + // Run different coomv kernels + if(trans == rocsparse_operation_none) + { +#define COOMVN_DIM 128 + rocsparse_int maxthreads = handle->properties.maxThreadsPerBlock; + rocsparse_int nprocs = handle->properties.multiProcessorCount; + rocsparse_int maxblocks = (nprocs * maxthreads - 1) / COOMVN_DIM + 1; + rocsparse_int minblocks = (nnz - 1) / COOMVN_DIM + 1; + + rocsparse_int nblocks = maxblocks < minblocks ? 
maxblocks : minblocks; + rocsparse_int nwarps = nblocks * (COOMVN_DIM / handle->warp_size); + rocsparse_int nloops = (nnz / handle->warp_size + 1) / nwarps + 1; + + dim3 coomvn_blocks(nblocks); + dim3 coomvn_threads(COOMVN_DIM); + + rocsparse_int* row_block_red = NULL; + T* val_block_red = NULL; + + RETURN_IF_HIP_ERROR(hipMalloc((void**)&row_block_red, sizeof(rocsparse_int) * nwarps)); + RETURN_IF_HIP_ERROR(hipMalloc((void**)&val_block_red, sizeof(T) * nwarps)); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + if(handle->warp_size == 32) + { + } + else if(handle->warp_size == 64) + { + } + else + { + return rocsparse_status_arch_mismatch; + } + } + else + { + if(*alpha == 0.0 && *beta == 1.0) + { + return rocsparse_status_success; + } + + // If beta == 0.0 we need to set y to 0 + if(*beta == 0.0) + { + RETURN_IF_HIP_ERROR(hipMemset(y, 0, sizeof(T) * m)); + } + else if(*beta != 1.0) + { + // Scale y by beta + // scale y TODO + } + + if(handle->warp_size == 32) + { + } + else if(handle->warp_size == 64) + { + hipLaunchKernelGGL((coomvn_warp_host_pointer), + coomvn_blocks, + coomvn_threads, + 0, + stream, + nnz, + nloops, + *alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red); + } + else + { + return rocsparse_status_arch_mismatch; + } + } + + hipLaunchKernelGGL((coomvn_block_reduce), + dim3(1), + coomvn_threads, + 0, + stream, + nwarps, + row_block_red, + val_block_red, + y); + + RETURN_IF_HIP_ERROR(hipFree(row_block_red)); + RETURN_IF_HIP_ERROR(hipFree(val_block_red)); +#undef COOMVN_DIM + } + else + { + // TODO + return rocsparse_status_not_implemented; + } + return rocsparse_status_success; +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_scoomv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + 
rocsparse_int n, + rocsparse_int nnz, + const float* alpha, + const rocsparse_mat_descr descr, + const float* coo_val, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const float* x, + const float* beta, + float* y) +{ + return rocsparse_coomv_template( + handle, trans, m, n, nnz, alpha, descr, coo_val, coo_row_ind, coo_col_ind, x, beta, y); +} + +extern "C" rocsparse_status rocsparse_dcoomv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const double* alpha, + const rocsparse_mat_descr descr, + const double* coo_val, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const double* x, + const double* beta, + double* y) +{ + return rocsparse_coomv_template( + handle, trans, m, n, nnz, alpha, descr, coo_val, coo_row_ind, coo_col_ind, x, beta, y); +} From 458438876306742a073a755ffcff56320061cf26 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 23 May 2018 19:20:56 +0200 Subject: [PATCH 070/304] coomv added to rocsparse.h header --- library/include/rocsparse-functions.h | 101 +++++++++++++++++++++++++ library/src/level2/rocsparse_coomv.cpp | 4 + 2 files changed, 105 insertions(+) diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index f142afe0..bc370baf 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -95,6 +95,107 @@ rocsparse_status rocsparse_zaxpyi(rocsparse_handle handle, * =========================================================================== */ +/*! \brief SPARSE Level 2 API + + \details + coomv multiplies the dense vector x with scalar alpha and sparse m x n + matrix A that is defined in COO storage format and adds the result to the + dense vector y that is multiplied by beta + + y := alpha * op(A) * x + beta * y + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + trans operation type of A. 
+ @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero entries of A. + @param[in] + alpha scalar alpha. + @param[in] + descr descriptor of A. + @param[in] + coo_val array of nnz elements of A. + @param[in] + coo_row_ind array of nnz elements containing the row indices of A. + @param[in] + coo_col_ind array of nnz elements containing the column indices of A. + @param[in] + x array of n elements (op(A) = A) or m elements (op(A) = A^T or + op(A) = A^H). + @param[in] + beta scalar beta. + @param[inout] + y array of m elements (op(A) = A) or n elements (op(A) = A^T or + op(A) = A^H). + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scoomv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const float* alpha, + const rocsparse_mat_descr descr, + const float* coo_val, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const float* x, + const float* beta, + float* y); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcoomv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const double* alpha, + const rocsparse_mat_descr descr, + const double* coo_val, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const double* x, + const double* beta, + double* y); +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_ccoomv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_float_complex* alpha, + const rocsparse_mat_descr descr, + const rocsparse_float_complex* coo_val, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const rocsparse_float_complex* x, + const rocsparse_float_complex* beta, + rocsparse_float_complex* y); + +ROCSPARSE_EXPORT +rocsparse_status 
rocsparse_zcoomv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_double_complex* alpha, + const rocsparse_mat_descr descr, + const rocsparse_double_complex* coo_val, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const rocsparse_double_complex* x, + const rocsparse_double_complex* beta, + rocsparse_double_complex* y); +*/ + /*! \brief SPARSE Level 2 API \details diff --git a/library/src/level2/rocsparse_coomv.cpp b/library/src/level2/rocsparse_coomv.cpp index d764a9a9..44f6c686 100644 --- a/library/src/level2/rocsparse_coomv.cpp +++ b/library/src/level2/rocsparse_coomv.cpp @@ -243,9 +243,11 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, { if(handle->warp_size == 32) { + return rocsparse_status_not_implemented; } else if(handle->warp_size == 64) { + return rocsparse_status_not_implemented; } else { @@ -268,10 +270,12 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, { // Scale y by beta // scale y TODO + return rocsparse_status_not_implemented; } if(handle->warp_size == 32) { + return rocsparse_status_not_implemented; } else if(handle->warp_size == 64) { From c60192e273cbf491b02e7c874b339835ebdc872e Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 24 May 2018 11:47:41 +0200 Subject: [PATCH 071/304] tests: fixed naming --- clients/tests/test_coo2csr.cpp | 13 +++++++------ clients/tests/test_csr2coo.cpp | 13 +++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/clients/tests/test_coo2csr.cpp b/clients/tests/test_coo2csr.cpp index 1ba84553..f1df1c47 100644 --- a/clients/tests/test_coo2csr.cpp +++ b/clients/tests/test_coo2csr.cpp @@ -11,10 +11,11 @@ typedef std::tuple coo2csr_tuple; -int coo_M_range[] = {-1, 0, 10, 500, 872, 1000}; -int coo_N_range[] = {-3, 0, 33, 242, 623, 1000}; +int coo2csr_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int coo2csr_N_range[] = {-3, 0, 33, 242, 623, 1000}; 
-rocsparse_index_base coo_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +rocsparse_index_base coo2csr_idx_base_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; class parameterized_coo2csr : public testing::TestWithParam { @@ -47,6 +48,6 @@ TEST_P(parameterized_coo2csr, coo2csr) INSTANTIATE_TEST_CASE_P(coo2csr, parameterized_coo2csr, - testing::Combine(testing::ValuesIn(coo_M_range), - testing::ValuesIn(coo_N_range), - testing::ValuesIn(coo_idx_base_range))); + testing::Combine(testing::ValuesIn(coo2csr_M_range), + testing::ValuesIn(coo2csr_N_range), + testing::ValuesIn(coo2csr_idx_base_range))); diff --git a/clients/tests/test_csr2coo.cpp b/clients/tests/test_csr2coo.cpp index bc68cc3c..04b4d940 100644 --- a/clients/tests/test_csr2coo.cpp +++ b/clients/tests/test_csr2coo.cpp @@ -11,10 +11,11 @@ typedef std::tuple csr2coo_tuple; -int coo_M_range[] = {-1, 0, 10, 500, 872, 1000}; -int coo_N_range[] = {-3, 0, 33, 242, 623, 1000}; +int csr2coo_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int csr2coo_N_range[] = {-3, 0, 33, 242, 623, 1000}; -rocsparse_index_base coo_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +rocsparse_index_base csr2coo_idx_base_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; class parameterized_csr2coo : public testing::TestWithParam { @@ -47,6 +48,6 @@ TEST_P(parameterized_csr2coo, csr2coo) INSTANTIATE_TEST_CASE_P(csr2coo, parameterized_csr2coo, - testing::Combine(testing::ValuesIn(coo_M_range), - testing::ValuesIn(coo_N_range), - testing::ValuesIn(coo_idx_base_range))); + testing::Combine(testing::ValuesIn(csr2coo_M_range), + testing::ValuesIn(csr2coo_N_range), + testing::ValuesIn(csr2coo_idx_base_range))); From 878d4541696bbdbf51334ddfbb1b58de1a791ca0 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 24 May 2018 11:48:13 +0200 Subject: [PATCH 072/304] added row/col sorting to matrix market reader --- clients/include/utility.hpp | 52 
++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index b47bce77..f1a47f60 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -412,9 +413,9 @@ rocsparse_int read_mtx_matrix(const char* filename, sscanf(line, "%d %d %d", &nrow, &ncol, &snnz); nnz = symm ? (snnz - nrow) * 2 + nrow : snnz; - row.resize(nnz); - col.resize(nnz); - val.resize(nnz); + std::vector unsorted_row(nnz); + std::vector unsorted_col(nnz); + std::vector unsorted_val(nnz); // Read entries rocsparse_int idx = 0; @@ -429,22 +430,55 @@ rocsparse_int read_mtx_matrix(const char* filename, --irow; --icol; - row[idx] = irow; - col[idx] = icol; - val[idx] = (T)dval; + unsorted_row[idx] = irow; + unsorted_col[idx] = icol; + unsorted_val[idx] = (T)dval; ++idx; if(symm && irow != icol) { - row[idx] = icol; - col[idx] = irow; - val[idx] = (T)dval; + unsorted_row[idx] = icol; + unsorted_col[idx] = irow; + unsorted_val[idx] = (T)dval; ++idx; } } fclose(f); + row.resize(nnz); + col.resize(nnz); + val.resize(nnz); + + // Sort by row and column index + std::vector perm(nnz); + for(rocsparse_int i = 0; i < nnz; ++i) + { + perm[i] = i; + } + + std::sort(perm.begin(), perm.end(), [&](const int& a, const int& b) { + if(unsorted_row[a] < unsorted_row[b]) + { + return true; + } + else if(unsorted_row[a] == unsorted_row[b]) + { + return (unsorted_col[a] < unsorted_col[b]); + } + else + { + return false; + } + }); + + for(rocsparse_int i = 0; i < nnz; ++i) + { + row[i] = unsorted_row[perm[i]]; + col[i] = unsorted_col[perm[i]]; + val[i] = unsorted_val[perm[i]]; + } + return 0; } From 08fee14d29d0eed50ea7278b0442e92a4b6e1bba Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 24 May 2018 11:57:36 +0200 Subject: [PATCH 073/304] coomv: fixed uninitialized values on device --- library/src/level2/coomv_device.h | 28 
++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/library/src/level2/coomv_device.h b/library/src/level2/coomv_device.h index 9e05e0f1..25fd7926 100644 --- a/library/src/level2/coomv_device.h +++ b/library/src/level2/coomv_device.h @@ -8,6 +8,19 @@ #include +template +__global__ void coomv_scale(rocsparse_int size, T scalar, T* data) +{ + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if(gid >= size) + { + return; + } + + data[gid] *= scalar; +} + // Implementation motivated by papers 'Efficient Sparse Matrix-Vector Multiplication on CUDA', // 'Implementing Sparse Matrix-Vector Multiplication on Throughput-Oriented Processors' and // 'Segmented operations for sparse matrix computation on vector multiprocessors' @@ -31,6 +44,13 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, // Warp index rocsparse_int warpid = gid / WARPSIZE; + // Initialize block buffers + if(laneid == 0) + { + row_block_red[warpid] = -1; + val_block_red[warpid] = static_cast(0); + } + // Global COO array index start for current warp rocsparse_int offset = warpid * loops * WARPSIZE; @@ -154,10 +174,10 @@ static __device__ void segmented_blockreduce(const rocsparse_int* rows, T* vals) } template -static __device__ void coomvn_general_block_reduce(rocsparse_int nnz, - const rocsparse_int* row_block_red, - const T* val_block_red, - T* y) +__global__ void coomvn_general_block_reduce(rocsparse_int nnz, + const rocsparse_int* row_block_red, + const T* val_block_red, + T* y) { rocsparse_int tid = hipThreadIdx_x; From 5a6589a9a9edd28b2a34cb06742b5965d4ed5147 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 24 May 2018 11:57:55 +0200 Subject: [PATCH 074/304] added support for other warpsizes and arbitrary beta --- library/src/level2/rocsparse_coomv.cpp | 96 +++++++++++++++++++++----- 1 file changed, 77 insertions(+), 19 deletions(-) diff --git a/library/src/level2/rocsparse_coomv.cpp 
b/library/src/level2/rocsparse_coomv.cpp index 44f6c686..43e03231 100644 --- a/library/src/level2/rocsparse_coomv.cpp +++ b/library/src/level2/rocsparse_coomv.cpp @@ -42,15 +42,6 @@ __global__ void coomvn_warp_device_pointer(rocsparse_int nnz, nnz, loops, *alpha, coo_row_ind, coo_col_ind, coo_val, x, y, row_block_red, val_block_red); } -template -__global__ void coomvn_block_reduce(rocsparse_int nnz, - const rocsparse_int* row_block_red, - const T* val_block_red, - T* y) -{ - coomvn_general_block_reduce(nnz, row_block_red, val_block_red, y); -} - /*! \brief SPARSE Level 2 API \details @@ -241,13 +232,61 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, if(handle->pointer_mode == rocsparse_pointer_mode_device) { + // We need a host copy of beta to avoid unneccessary kernel launch + T h_beta; + RETURN_IF_HIP_ERROR(hipMemcpy(&h_beta, beta, sizeof(T), hipMemcpyDeviceToHost)); + + if(h_beta == static_cast(0)) + { + RETURN_IF_HIP_ERROR(hipMemset(y, 0, sizeof(T) * m)); + } + else if(h_beta != static_cast(1)) + { + hipLaunchKernelGGL((coomv_scale), + dim3((m - 1) / COOMVN_DIM + 1), + coomvn_threads, + 0, + stream, + m, + h_beta, + y); + } + if(handle->warp_size == 32) { - return rocsparse_status_not_implemented; + hipLaunchKernelGGL((coomvn_warp_device_pointer), + coomvn_blocks, + coomvn_threads, + 0, + stream, + nnz, + nloops, + alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red); } else if(handle->warp_size == 64) { - return rocsparse_status_not_implemented; + hipLaunchKernelGGL((coomvn_warp_device_pointer), + coomvn_blocks, + coomvn_threads, + 0, + stream, + nnz, + nloops, + alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red); } else { @@ -256,26 +295,45 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, } else { - if(*alpha == 0.0 && *beta == 1.0) + if(*alpha == static_cast(0) && *beta == static_cast(1)) { return rocsparse_status_success; } // If beta 
== 0.0 we need to set y to 0 - if(*beta == 0.0) + if(*beta == static_cast(0)) { RETURN_IF_HIP_ERROR(hipMemset(y, 0, sizeof(T) * m)); } - else if(*beta != 1.0) + else if(*beta != static_cast(1)) { - // Scale y by beta - // scale y TODO - return rocsparse_status_not_implemented; + hipLaunchKernelGGL((coomv_scale), + dim3((m - 1) / COOMVN_DIM + 1), + coomvn_threads, + 0, + stream, + m, + *beta, + y); } if(handle->warp_size == 32) { - return rocsparse_status_not_implemented; + hipLaunchKernelGGL((coomvn_warp_host_pointer), + coomvn_blocks, + coomvn_threads, + 0, + stream, + nnz, + nloops, + *alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red); } else if(handle->warp_size == 64) { @@ -301,7 +359,7 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, } } - hipLaunchKernelGGL((coomvn_block_reduce), + hipLaunchKernelGGL((coomvn_general_block_reduce), dim3(1), coomvn_threads, 0, From 22c1a88ae3255a22dc54cd00b3c2015b9d89987d Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 24 May 2018 11:58:33 +0200 Subject: [PATCH 075/304] coomv: tests, benchmark and example --- clients/benchmarks/client.cpp | 10 +- .../rocsparse_template_specialization.cpp | 38 ++ clients/include/rocsparse.hpp | 14 + clients/include/testing_coomv.hpp | 383 ++++++++++++++++++ clients/samples/CMakeLists.txt | 1 + clients/samples/example_coomv.cpp | 172 ++++++++ clients/tests/CMakeLists.txt | 1 + clients/tests/test_coomv.cpp | 68 ++++ 8 files changed, 686 insertions(+), 1 deletion(-) create mode 100644 clients/include/testing_coomv.hpp create mode 100644 clients/samples/example_coomv.cpp create mode 100644 clients/tests/test_coomv.cpp diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index cf1ffcdc..648cfea5 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -4,6 +4,7 @@ #include "utility.hpp" #include "rocsparse.hpp" +#include "testing_coomv.hpp" #include "testing_csrmv.hpp" #include 
"testing_axpyi.hpp" #include "testing_csr2coo.hpp" @@ -62,7 +63,7 @@ int main(int argc, char* argv[]) ("function,f", po::value(&function)->default_value("axpyi"), - "SPARSE function to test. Options: axpyi, csrmv, csr2coo, coo2csr") + "SPARSE function to test. Options: axpyi, coomv, csrmv, csr2coo, coo2csr") ("precision,r", po::value(&precision)->default_value('s'), "Options: s,d") @@ -124,6 +125,13 @@ int main(int argc, char* argv[]) else if(precision == 'd') testing_axpyi(argus); } + else if(function == "coomv") + { + if(precision == 's') + testing_coomv(argus); + else if(precision == 'd') + testing_coomv(argus); + } else if(function == "csrmv") { if(precision == 's') diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 9ea625c0..386176c1 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -32,6 +32,44 @@ rocsparse_status rocsparse_axpyi(rocsparse_handle handle, return rocsparse_daxpyi(handle, nnz, alpha, x_val, x_ind, y, idx_base); } +template <> +rocsparse_status rocsparse_coomv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const float* alpha, + const rocsparse_mat_descr descr, + const float* coo_val, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const float* x, + const float* beta, + float* y) +{ + return rocsparse_scoomv( + handle, trans, m, n, nnz, alpha, descr, coo_val, coo_row_ind, coo_col_ind, x, beta, y); +} + +template <> +rocsparse_status rocsparse_coomv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const double* alpha, + const rocsparse_mat_descr descr, + const double* coo_val, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const double* x, + const double* beta, + double* y) +{ + return rocsparse_dcoomv( + handle, 
trans, m, n, nnz, alpha, descr, coo_val, coo_row_ind, coo_col_ind, x, beta, y); +} + template <> rocsparse_status rocsparse_csrmv(rocsparse_handle handle, rocsparse_operation trans, diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index 2e59a9d8..7157c00a 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -20,6 +20,20 @@ rocsparse_status rocsparse_axpyi(rocsparse_handle handle, rocsparse_index_base idx_base); template +rocsparse_status rocsparse_coomv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const T* alpha, + const rocsparse_mat_descr descr, + const T* coo_val, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const T* x, + const T* beta, + T* y); +template rocsparse_status rocsparse_csrmv(rocsparse_handle handle, rocsparse_operation trans, rocsparse_int m, diff --git a/clients/include/testing_coomv.hpp b/clients/include/testing_coomv.hpp new file mode 100644 index 00000000..7d7d7c6a --- /dev/null +++ b/clients/include/testing_coomv.hpp @@ -0,0 +1,383 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef TESTING_COOMV_HPP +#define TESTING_COOMV_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_coomv_bad_arg(void) +{ + rocsparse_int n = 100; + rocsparse_int m = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + T alpha = 0.6; + T beta = 0.2; + rocsparse_operation trans = rocsparse_operation_none; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_descr(new descr_struct); + rocsparse_mat_descr descr = unique_ptr_descr->descr; + + auto drow_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* drow = (rocsparse_int*)drow_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dval || !drow || !dcol || !dx || !dy) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // testing for(nullptr == drow) + { + rocsparse_int* drow_null = nullptr; + + status = rocsparse_coomv( + handle, trans, m, n, nnz, &alpha, descr, dval, drow_null, dcol, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: drow is nullptr"); + } + // testing for(nullptr == dcol) + { + 
rocsparse_int* dcol_null = nullptr; + + status = rocsparse_coomv( + handle, trans, m, n, nnz, &alpha, descr, dval, drow, dcol_null, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); + } + // testing for(nullptr == dval) + { + T* dval_null = nullptr; + + status = rocsparse_coomv( + handle, trans, m, n, nnz, &alpha, descr, dval_null, drow, dcol, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); + } + // testing for(nullptr == dx) + { + T* dx_null = nullptr; + + status = rocsparse_coomv( + handle, trans, m, n, nnz, &alpha, descr, dval, drow, dcol, dx_null, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); + } + // testing for(nullptr == dy) + { + T* dy_null = nullptr; + + status = rocsparse_coomv( + handle, trans, m, n, nnz, &alpha, descr, dval, drow, dcol, dx, &beta, dy_null); + verify_rocsparse_status_invalid_pointer(status, "Error: dy is nullptr"); + } + // testing for(nullptr == d_alpha) + { + T* d_alpha_null = nullptr; + + status = rocsparse_coomv( + handle, trans, m, n, nnz, d_alpha_null, descr, dval, drow, dcol, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); + } + // testing for(nullptr == d_beta) + { + T* d_beta_null = nullptr; + + status = rocsparse_coomv( + handle, trans, m, n, nnz, &alpha, descr, dval, drow, dcol, dx, d_beta_null, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); + } + // testing for(nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = rocsparse_coomv( + handle, trans, m, n, nnz, &alpha, descr_null, dval, drow, dcol, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_coomv( + handle_null, trans, m, n, nnz, &alpha, descr, dval, drow, dcol, dx, &beta, dy); + 
verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_coomv(Arguments argus) +{ + rocsparse_int safe_size = 100; + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + T h_alpha = argus.alpha; + T h_beta = argus.beta; + rocsparse_operation trans = argus.trans; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_status status; + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + std::unique_ptr test_descr(new descr_struct); + rocsparse_mat_descr descr = test_descr->descr; + + // Determine number of non-zero elements + double scale = 0.02; + if(m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + + // Argument sanity check before allocating invalid memory + if(m <= 0 || n <= 0 || nnz <= 0) + { + auto drow_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* drow = (rocsparse_int*)drow_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dval || !drow || !dcol || !dx || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!drow || !dcol || !dval || !dx || !dy"); + return rocsparse_status_memory_error; + } + + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + status = rocsparse_coomv( + handle, trans, m, n, nnz, &h_alpha, descr, dval, drow, dcol, dx, &h_beta, dy); + + if(m < 0 || n < 
0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + } + + return rocsparse_status_success; + } + + // Host structures + std::vector hptr; + std::vector hrow; + std::vector hcol; + std::vector hval; + + // Initial Data on CPU + srand(12345ULL); + if(argus.laplacian) + { + m = n = gen_2d_laplacian(argus.laplacian, hptr, hcol, hval, idx_base); + nnz = hptr[m]; + hrow.resize(nnz); + + // Convert CSR to COO + for(rocsparse_int i = 0; i < m; ++i) + { + for(rocsparse_int j = hptr[i]; j < hptr[i + 1]; ++j) + { + hrow[j - idx_base] = i + idx_base; + } + } + } + else + { + if(argus.filename != "") + { + if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hrow, hcol, hval) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hrow, hcol, hval, idx_base); + } + } + + std::vector hx(n); + std::vector hy_1(m); + std::vector hy_2(m); + std::vector hy_gold(m); + + rocsparse_init(hx, 1, n); + rocsparse_init(hy_1, 1, m); + + // copy vector is easy in STL; hy_gold = hx: save a copy in hy_gold which will be output of CPU + hy_2 = hy_1; + hy_gold = hy_1; + + // allocate memory on device + auto drow_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * n), device_free}; + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + auto 
d_beta_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + + rocsparse_int* drow = (rocsparse_int*)drow_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy_1 = (T*)dy_1_managed.get(); + T* dy_2 = (T*)dy_2_managed.get(); + T* d_alpha = (T*)d_alpha_managed.get(); + T* d_beta = (T*)d_beta_managed.get(); + + if(!dval || !drow || !dcol || !dx || !dy_1 || !dy_2 || !d_alpha || !d_beta) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dval || !drow || !dcol || !dx || " + "!dy_1 || !dy_2 || !d_alpha || !d_beta"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR( + hipMemcpy(drow, hrow.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dcol, hcol.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dval, hval.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * n, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T) * m, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); + + if(argus.unit_check) + { + CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * m, hipMemcpyHostToDevice)); + + // ROCSPARSE pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_coomv( + handle, trans, m, n, nnz, &h_alpha, descr, dval, drow, dcol, dx, &h_beta, dy_1)); + + // ROCSPARSE pointer mode device + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR(rocsparse_coomv( + handle, trans, m, n, nnz, d_alpha, descr, dval, drow, dcol, dx, d_beta, dy_2)); + + // copy 
output from device to CPU + CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * m, hipMemcpyDeviceToHost)); + + // CPU + double cpu_time_used = get_time_us(); + + for(rocsparse_int i = 0; i < m; ++i) + { + hy_gold[i] *= h_beta; + } + + for(rocsparse_int i = 0; i < nnz; ++i) + { + hy_gold[hrow[i]] += h_alpha * hval[i] * hx[hcol[i]]; + } + + cpu_time_used = get_time_us() - cpu_time_used; + + // enable unit check, notice unit check is not invasive, but norm check is, + // unit check and norm check can not be interchanged their order + if(argus.unit_check) + { + unit_check_general(1, m, hy_gold.data(), hy_1.data()); + unit_check_general(1, m, hy_gold.data(), hy_2.data()); + } + } + + if(argus.timing) + { + int number_cold_calls = 10; + int number_hot_calls = argus.iters; + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + for(int iter = 0; iter < number_cold_calls; iter++) + { + rocsparse_coomv( + handle, trans, m, n, nnz, &h_alpha, descr, dval, drow, dcol, dx, &h_beta, dy_1); + } + + double gpu_time_used = get_time_us(); // in microseconds + + for(int iter = 0; iter < number_hot_calls; iter++) + { + rocsparse_coomv( + handle, trans, m, n, nnz, &h_alpha, descr, dval, drow, dcol, dx, &h_beta, dy_1); + } + + // Convert to miliseconds per call + gpu_time_used = get_time_us() - gpu_time_used; + gpu_time_used = gpu_time_used / (number_hot_calls * 1e3); + size_t flops = (h_alpha != 1.0) ? 3.0 * nnz : 2.0 * nnz; + flops = (h_beta != 0.0) ? 
flops + m : flops; + double gpu_gflops = flops / gpu_time_used / 1e6; + size_t memtrans = 3 * m + nnz; + if(h_beta == 0.0) + { + memtrans += m; + } + else if(h_beta != 1.0) + { + memtrans += 2 * m; + } + double bandwidth = + (memtrans * sizeof(T) + (2 * nnz) * sizeof(rocsparse_int)) / gpu_time_used / 1e6; + + printf("m\t\tn\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", + m, + n, + nnz, + h_alpha, + h_beta, + gpu_gflops, + bandwidth, + gpu_time_used); + } + return rocsparse_status_success; +} + +#endif // TESTING_COOMV_HPP diff --git a/clients/samples/CMakeLists.txt b/clients/samples/CMakeLists.txt index 9cdbb3e9..8a92e5f8 100644 --- a/clients/samples/CMakeLists.txt +++ b/clients/samples/CMakeLists.txt @@ -44,5 +44,6 @@ endfunction() # Examples add_rocsparse_example(example_handle.cpp) +add_rocsparse_example(example_coomv.cpp) add_rocsparse_example(example_csrmv.cpp) add_rocsparse_example(example_ellmv.cpp) diff --git a/clients/samples/example_coomv.cpp b/clients/samples/example_coomv.cpp new file mode 100644 index 00000000..25e89791 --- /dev/null +++ b/clients/samples/example_coomv.cpp @@ -0,0 +1,172 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "utility.hpp" + +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Parse command line + if(argc < 2) + { + fprintf(stderr, "%s [ ]\n", argv[0]); + return -1; + } + + int ndim = atoi(argv[1]); + int trials = 200; + int batch_size = 1; + + if(argc > 2) + { + trials = atoi(argv[2]); + } + if(argc > 3) + { + batch_size = atoi(argv[3]); + } + + // rocSPARSE handle + rocsparse_handle handle; + rocsparse_create_handle(&handle); + + hipDeviceProp_t devProp; + int device_id = 0; + + hipGetDevice(&device_id); + hipGetDeviceProperties(&devProp, device_id); + printf("Device: %s\n", devProp.name); + + // Generate problem + std::vector hAptr; + std::vector hAcol; + std::vector hAval; + int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); + int n = m; + int nnz = hAptr[m]; + + // Convert to COO matrix + std::vector hArow(nnz); + + for(int i = 0; i < m; ++i) + { + for(int j = hAptr[i]; j < hAptr[i + 1]; ++j) + { + hArow[j] = i; + } + } + + // Sample some random data + srand(12345ULL); + + double halpha = static_cast(rand()) / RAND_MAX; + double hbeta = 0.0; + + std::vector hx(m); + rocsparse_init(hx, 1, m); + + // Matrix descriptor + rocsparse_mat_descr descrA; + rocsparse_create_mat_descr(&descrA); + + // Offload data to device + int* dArow = NULL; + int* dAcol = NULL; + double* dAval = NULL; + double* dx = NULL; + double* dy = NULL; + + hipMalloc((void**)&dArow, sizeof(int) * nnz); + hipMalloc((void**)&dAcol, sizeof(int) * nnz); + hipMalloc((void**)&dAval, sizeof(double) * nnz); + hipMalloc((void**)&dx, sizeof(double) * m); + hipMalloc((void**)&dy, sizeof(double) * m); + + hipMemcpy(dArow, hArow.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dAcol, hAcol.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dAval, hAval.data(), sizeof(double) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dx, hx.data(), 
sizeof(double) * m, hipMemcpyHostToDevice); + + // Warm up + for(int i = 0; i < 10; ++i) + { + // Call rocsparse coomv + rocsparse_dcoomv(handle, + rocsparse_operation_none, + m, + n, + nnz, + &halpha, + descrA, + dAval, + dArow, + dAcol, + dx, + &hbeta, + dy); + } + + // Device synchronization + hipDeviceSynchronize(); + + // Start time measurement + double time = get_time_us(); + + // COO matrix vector multiplication + for(int i = 0; i < trials; ++i) + { + for(int i = 0; i < batch_size; ++i) + { + // Call rocsparse coomv + rocsparse_dcoomv(handle, + rocsparse_operation_none, + m, + n, + nnz, + &halpha, + descrA, + dAval, + dArow, + dAcol, + dx, + &hbeta, + dy); + } + + // Device synchronization + hipDeviceSynchronize(); + } + + time = (get_time_us() - time) / (trials * batch_size * 1e3); + double bandwidth = + static_cast(sizeof(double) * (4 * m + nnz) + sizeof(rocsparse_int) * (2 * nnz)) / + time / 1e6; + double gflops = static_cast(3 * nnz) / time / 1e6; + printf("m\t\tn\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tusec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", + m, + n, + nnz, + halpha, + hbeta, + gflops, + bandwidth, + time); + + // Clear up on device + hipFree(dArow); + hipFree(dAcol); + hipFree(dAval); + hipFree(dx); + hipFree(dy); + + rocsparse_destroy_mat_descr(descrA); + rocsparse_destroy_handle(handle); + + return 0; +} diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index 3b56fb20..c0a0e811 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -7,6 +7,7 @@ find_package(GTest REQUIRED) set(ROCSPARSE_TEST_SOURCES rocsparse_gtest_main.cpp test_axpyi.cpp + test_coomv.cpp test_csrmv.cpp test_csr2coo.cpp test_coo2csr.cpp diff --git a/clients/tests/test_coomv.cpp b/clients/tests/test_coomv.cpp new file mode 100644 index 00000000..37549851 --- /dev/null +++ b/clients/tests/test_coomv.cpp @@ -0,0 +1,68 @@ +/* ************************************************************************ + * 
Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "testing_coomv.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef rocsparse_index_base base; +typedef std::tuple coomv_tuple; + +int coo_M_range[] = {-1, 0, 10, 500, 7111, 10000}; +int coo_N_range[] = {-3, 0, 33, 842, 4441, 10000}; + +std::vector coo_alpha_range = {2.0, 3.0}; +std::vector coo_beta_range = {0.0, 0.67, 1.0}; + +base coo_idxbase_range[] = {rocsparse_index_base_zero}; + +class parameterized_coomv : public testing::TestWithParam +{ + protected: + parameterized_coomv() {} + virtual ~parameterized_coomv() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_coomv_arguments(coomv_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.alpha = std::get<2>(tup); + arg.beta = std::get<3>(tup); + arg.idx_base = std::get<4>(tup); + arg.timing = 0; + return arg; +} + +TEST(coomv_bad_arg, coomv_float) { testing_coomv_bad_arg(); } + +TEST_P(parameterized_coomv, coomv_float) +{ + Arguments arg = setup_coomv_arguments(GetParam()); + + rocsparse_status status = testing_coomv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_coomv, coomv_double) +{ + Arguments arg = setup_coomv_arguments(GetParam()); + + rocsparse_status status = testing_coomv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(coomv, + parameterized_coomv, + testing::Combine(testing::ValuesIn(coo_M_range), + testing::ValuesIn(coo_N_range), + testing::ValuesIn(coo_alpha_range), + testing::ValuesIn(coo_beta_range), + testing::ValuesIn(coo_idxbase_range))); From ceed3aba1b28f4818741fc9e430a5c1e4be3e7f8 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 24 May 2018 12:07:05 +0200 Subject: [PATCH 076/304] jenkinsfile update --- Jenkinsfile | 49 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 
insertions(+), 9 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 3ffc7a82..f008822d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -21,6 +21,27 @@ properties([ import java.nio.file.Path; //////////////////////////////////////////////////////////////////////// +// Check whether job was started by a timer +@NonCPS +def isJobStartedByTimer() { + def startedByTimer = false + try { + def buildCauses = currentBuild.rawBuild.getCauses() + for ( buildCause in buildCauses ) { + if (buildCause != null) { + def causeDescription = buildCause.getShortDescription() + echo "shortDescription: ${causeDescription}" + if (causeDescription.contains("Started by timer")) { + startedByTimer = true + } + } + } + } catch(theError) { + echo "Error getting build cause" + } + + return startedByTimer +} //////////////////////////////////////////////////////////////////////// // Return build number of upstream job @@ -181,12 +202,24 @@ def docker_build_inside_image( def build_image, compiler_data compiler_args, doc // Cap the maximum amount of testing to be a few hours; assume failure if the time limit is hit timeout(time: 2, unit: 'HOURS') { - sh """#!/usr/bin/env bash - set -x - cd ${paths.project_build_prefix}/build/release/clients/tests - LD_LIBRARY_PATH=/opt/rocm/hcc/lib ./rocsparse-test${build_type_postfix} --gtest_output=xml --gtest_color=yes - """ - junit "${paths.project_build_prefix}/build/release/clients/tests/*.xml" + if(isJobStartedByTimer()) + { + sh """#!/usr/bin/env bash + set -x + cd ${paths.project_build_prefix}/build/release/clients/tests + LD_LIBRARY_PATH=/opt/rocm/hcc/lib ./rocsparse-test${build_type_postfix} --gtest_output=xml --gtest_color=yes #--gtest_filter=*nightly* + """ + junit "${paths.project_build_prefix}/build/release/clients/tests/*.xml" + } + else + { + sh """#!/usr/bin/env bash + set -x + cd ${paths.project_build_prefix}/build/release/clients/tests + LD_LIBRARY_PATH=/opt/rocm/hcc/lib ./rocsparse-test${build_type_postfix} --gtest_output=xml 
--gtest_color=yes #--gtest_filter=*checkin* + """ + junit "${paths.project_build_prefix}/build/release/clients/tests/*.xml" + } } String docker_context = "${compiler_args.build_config}/${compiler_args.compiler_name}" @@ -247,7 +280,7 @@ def docker_build_inside_image( def build_image, compiler_data compiler_args, doc String docker_test_install( compiler_data compiler_args, docker_data docker_args, project_paths rocsparse_paths, String job_name ) { def rocsparse_install_image = null - String image_name = "rocsparse-hip-${compiler_args.compiler_name}" + String image_name = "rocsparse-hip-${compiler_args.compiler_name}-ubuntu-16.04" String docker_context = "${compiler_args.build_config}/${compiler_args.compiler_name}" stage( "Install ${compiler_args.compiler_name} ${compiler_args.build_config}" ) @@ -423,7 +456,6 @@ parallel hcc_ctu: def print_version_closure = { sh """ set -x - /opt/rocm/bin/rocm_agent_enumerator -t ALL /opt/rocm/bin/hcc --version """ } @@ -461,7 +493,6 @@ rocm_ubuntu: def print_version_closure = { sh """ set -x - /opt/rocm/bin/rocm_agent_enumerator -t ALL /opt/rocm/bin/hcc --version """ } From b65c85cd5083bf987f2cd7ef1bba3a83ce60f0e4 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 24 May 2018 12:48:25 +0200 Subject: [PATCH 077/304] coomv: rocsparse_index_base_one support --- clients/include/testing_coomv.hpp | 5 ++- clients/tests/test_coomv.cpp | 2 +- library/src/level2/coomv_device.h | 7 ++-- library/src/level2/rocsparse_coomv.cpp | 52 +++++++++++++++++++------- 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/clients/include/testing_coomv.hpp b/clients/include/testing_coomv.hpp index 7d7d7c6a..5c1b96f6 100644 --- a/clients/include/testing_coomv.hpp +++ b/clients/include/testing_coomv.hpp @@ -147,6 +147,9 @@ rocsparse_status testing_coomv(Arguments argus) std::unique_ptr test_descr(new descr_struct); rocsparse_mat_descr descr = test_descr->descr; + // Set matrix index base + 
CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr, idx_base)); + // Determine number of non-zero elements double scale = 0.02; if(m > 1000 || n > 1000) @@ -314,7 +317,7 @@ rocsparse_status testing_coomv(Arguments argus) for(rocsparse_int i = 0; i < nnz; ++i) { - hy_gold[hrow[i]] += h_alpha * hval[i] * hx[hcol[i]]; + hy_gold[hrow[i] - idx_base] += h_alpha * hval[i] * hx[hcol[i] - idx_base]; } cpu_time_used = get_time_us() - cpu_time_used; diff --git a/clients/tests/test_coomv.cpp b/clients/tests/test_coomv.cpp index 37549851..c604ad2c 100644 --- a/clients/tests/test_coomv.cpp +++ b/clients/tests/test_coomv.cpp @@ -18,7 +18,7 @@ int coo_N_range[] = {-3, 0, 33, 842, 4441, 10000}; std::vector coo_alpha_range = {2.0, 3.0}; std::vector coo_beta_range = {0.0, 0.67, 1.0}; -base coo_idxbase_range[] = {rocsparse_index_base_zero}; +base coo_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; class parameterized_coomv : public testing::TestWithParam { diff --git a/library/src/level2/coomv_device.h b/library/src/level2/coomv_device.h index 25fd7926..b2457c3e 100644 --- a/library/src/level2/coomv_device.h +++ b/library/src/level2/coomv_device.h @@ -34,7 +34,8 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, const T* x, T* y, rocsparse_int* row_block_red, - T* val_block_red) + T* val_block_red, + rocsparse_index_base idx_base) { rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; rocsparse_int tid = hipThreadIdx_x; @@ -78,8 +79,8 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, // nnz % WARPSIZE != 0 if(idx < nnz) { - row = coo_row_ind[idx]; - val = alpha * coo_val[idx] * x[coo_col_ind[idx]]; + row = coo_row_ind[idx] - idx_base; + val = alpha * coo_val[idx] * x[coo_col_ind[idx] - idx_base]; } else { diff --git a/library/src/level2/rocsparse_coomv.cpp b/library/src/level2/rocsparse_coomv.cpp index 43e03231..fc17a1b4 100644 --- a/library/src/level2/rocsparse_coomv.cpp +++ 
b/library/src/level2/rocsparse_coomv.cpp @@ -20,10 +20,20 @@ __global__ void coomvn_warp_host_pointer(rocsparse_int nnz, const T* x, T* y, rocsparse_int* row_block_red, - T* val_block_red) + T* val_block_red, + rocsparse_index_base idx_base) { - coomvn_general_warp_reduce( - nnz, loops, alpha, coo_row_ind, coo_col_ind, coo_val, x, y, row_block_red, val_block_red); + coomvn_general_warp_reduce(nnz, + loops, + alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + idx_base); } template @@ -36,10 +46,20 @@ __global__ void coomvn_warp_device_pointer(rocsparse_int nnz, const T* x, T* y, rocsparse_int* row_block_red, - T* val_block_red) + T* val_block_red, + rocsparse_index_base idx_base) { - coomvn_general_warp_reduce( - nnz, loops, *alpha, coo_row_ind, coo_col_ind, coo_val, x, y, row_block_red, val_block_red); + coomvn_general_warp_reduce(nnz, + loops, + *alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + idx_base); } /*! 
\brief SPARSE Level 2 API @@ -143,12 +163,12 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, (const void*&)y); } - // Check matrix type - if(descr->base != rocsparse_index_base_zero) + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) { - // TODO - return rocsparse_status_not_implemented; + return rocsparse_status_invalid_value; } + // Check matrix type if(descr->type != rocsparse_matrix_type_general) { // TODO @@ -268,7 +288,8 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, x, y, row_block_red, - val_block_red); + val_block_red, + descr->base); } else if(handle->warp_size == 64) { @@ -286,7 +307,8 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, x, y, row_block_red, - val_block_red); + val_block_red, + descr->base); } else { @@ -333,7 +355,8 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, x, y, row_block_red, - val_block_red); + val_block_red, + descr->base); } else if(handle->warp_size == 64) { @@ -351,7 +374,8 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, x, y, row_block_red, - val_block_red); + val_block_red, + descr->base); } else { From 8167d16a6b5b7c55dc3b7fbbbf727558bc74401e Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 24 May 2018 12:56:47 +0200 Subject: [PATCH 078/304] csrmv: added support for rocsparse_index_base_one --- clients/include/testing_csrmv.hpp | 8 ++- clients/tests/test_csrmv.cpp | 2 +- library/src/level2/csrmv_device.h | 9 +-- library/src/level2/rocsparse_csrmv.cpp | 83 ++++++++++++++++---------- 4 files changed, 65 insertions(+), 37 deletions(-) diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index 1d870966..c51e1f9d 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -147,6 +147,9 @@ rocsparse_status testing_csrmv(Arguments argus) std::unique_ptr test_descr(new descr_struct); 
rocsparse_mat_descr descr = test_descr->descr; + // Set matrix index base + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr, idx_base)); + // Determine number of non-zero elements double scale = 0.02; if(m > 1000 || n > 1000) @@ -317,9 +320,10 @@ rocsparse_status testing_csrmv(Arguments argus) for(rocsparse_int i = 0; i < m; ++i) { hy_gold[i] *= h_beta; - for(rocsparse_int j = hcsr_row_ptr[i]; j < hcsr_row_ptr[i + 1]; ++j) + for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; + ++j) { - hy_gold[i] += h_alpha * hval[j] * hx[hcol_ind[j]]; + hy_gold[i] += h_alpha * hval[j] * hx[hcol_ind[j] - idx_base]; } } diff --git a/clients/tests/test_csrmv.cpp b/clients/tests/test_csrmv.cpp index bf4042ac..ccad138d 100644 --- a/clients/tests/test_csrmv.cpp +++ b/clients/tests/test_csrmv.cpp @@ -18,7 +18,7 @@ int csr_N_range[] = {-3, 0, 33, 842, 4441, 10000}; std::vector csr_alpha_range = {2.0, 3.0}; std::vector csr_beta_range = {0.0, 1.0}; -base csr_idxbase_range[] = {rocsparse_index_base_zero}; +base csr_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; class parameterized_csrmv : public testing::TestWithParam { diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index faf51a3e..79d5626b 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -159,7 +159,8 @@ static __device__ void csrmvn_general_device(int num_rows, const T* val, const T* x, T beta, - T* y) + T* y, + rocsparse_index_base idx_base) { __shared__ volatile T sdata[WG_SIZE + SUBWAVE_SIZE / 2]; @@ -173,8 +174,8 @@ static __device__ void csrmvn_general_device(int num_rows, for(int row = vector_id; row < num_rows; row += num_vectors) { - const int row_start = row_offset[row]; - const int row_end = row_offset[row + 1]; + const int row_start = row_offset[row] - idx_base; + const int row_end = row_offset[row + 1] - idx_base; T sum = 0.; T sumk_e = 0.; @@ -182,7 +183,7 @@ static __device__ void 
csrmvn_general_device(int num_rows, // It is about 5% faster to always multiply by alpha, rather than to // check whether alpha is 0, 1, or other and do different code paths. for(int j = row_start + thread_lane; j < row_end; j += SUBWAVE_SIZE) - sum = two_fma(alpha * val[j], x[col[j]], sum, &sumk_e); + sum = two_fma(alpha * val[j], x[col[j] - idx_base], sum, &sumk_e); T new_error = 0.; sum = two_sum(sum, sumk_e, &new_error); diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index 1edc288d..411a36cd 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -17,10 +17,11 @@ __global__ void csrmvn_kernel_host_pointer(rocsparse_int m, const T* csr_val, const T* x, T beta, - T* y) + T* y, + rocsparse_index_base idx_base) { csrmvn_general_device( - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y); + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y, idx_base); } template @@ -31,10 +32,11 @@ __global__ void csrmvn_kernel_device_pointer(rocsparse_int m, const T* csr_val, const T* x, const T* beta, - T* y) + T* y, + rocsparse_index_base idx_base) { csrmvn_general_device( - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y); + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y, idx_base); } /*! 
\brief SPARSE Level 2 API @@ -139,11 +141,10 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, (const void*&)y); } - // Check matrix type - if(descr->base != rocsparse_index_base_zero) + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) { - // TODO - return rocsparse_status_not_implemented; + return rocsparse_status_invalid_value; } if(descr->type != rocsparse_matrix_type_general) { @@ -231,7 +232,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, beta, - y); + y, + descr->base); } else if(nnz_per_row < 8) { @@ -247,7 +249,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, beta, - y); + y, + descr->base); } else if(nnz_per_row < 16) { @@ -263,7 +266,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, beta, - y); + y, + descr->base); } else if(nnz_per_row < 32) { @@ -279,7 +283,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, beta, - y); + y, + descr->base); } else { @@ -295,7 +300,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, beta, - y); + y, + descr->base); } } else if(handle->warp_size == 64) @@ -314,7 +320,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, beta, - y); + y, + descr->base); } else if(nnz_per_row < 8) { @@ -330,7 +337,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, beta, - y); + y, + descr->base); } else if(nnz_per_row < 16) { @@ -346,7 +354,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, beta, - y); + y, + descr->base); } else if(nnz_per_row < 32) { @@ -362,7 +371,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, beta, - y); + y, + descr->base); } else if(nnz_per_row < 64) { @@ -378,7 +388,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, 
csr_val, x, beta, - y); + y, + descr->base); } else { @@ -394,7 +405,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, beta, - y); + y, + descr->base); } } else @@ -425,7 +437,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, *beta, - y); + y, + descr->base); } else if(nnz_per_row < 8) { @@ -441,7 +454,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, *beta, - y); + y, + descr->base); } else if(nnz_per_row < 16) { @@ -457,7 +471,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, *beta, - y); + y, + descr->base); } else if(nnz_per_row < 32) { @@ -473,7 +488,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, *beta, - y); + y, + descr->base); } else { @@ -489,7 +505,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, *beta, - y); + y, + descr->base); } } else if(handle->warp_size == 64) @@ -508,7 +525,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, *beta, - y); + y, + descr->base); } else if(nnz_per_row < 8) { @@ -524,7 +542,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, *beta, - y); + y, + descr->base); } else if(nnz_per_row < 16) { @@ -540,7 +559,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, *beta, - y); + y, + descr->base); } else if(nnz_per_row < 32) { @@ -556,7 +576,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, *beta, - y); + y, + descr->base); } else if(nnz_per_row < 64) { @@ -572,7 +593,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, *beta, - y); + y, + descr->base); } else { @@ -588,7 +610,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, csr_val, x, *beta, - y); + y, + descr->base); } } else From bfe7f187774e0fa00db0d52ec56c5ae4a5d834b6 Mon Sep 17 00:00:00 
2001 From: Nico Trost Date: Thu, 24 May 2018 13:00:33 +0200 Subject: [PATCH 079/304] hybmv: rocsparse_index_base_one support --- library/src/level2/ellmv_device.h | 5 +++-- library/src/level2/rocsparse_hybmv.cpp | 21 ++++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/library/src/level2/ellmv_device.h b/library/src/level2/ellmv_device.h index 1a003edd..a96affbe 100644 --- a/library/src/level2/ellmv_device.h +++ b/library/src/level2/ellmv_device.h @@ -15,7 +15,8 @@ static __device__ void ellmvn_device(rocsparse_int m, const T* ell_val, const T* x, T beta, - T* y) + T* y, + rocsparse_index_base idx_base) { rocsparse_int ai = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -28,7 +29,7 @@ static __device__ void ellmvn_device(rocsparse_int m, for(rocsparse_int p = 0; p < ell_width; ++p) { rocsparse_int idx = ELL_IND(ai, p, m, ell_width); - rocsparse_int col = ell_col_ind[idx]; + rocsparse_int col = ell_col_ind[idx] - idx_base; if(col >= 0 && col < n) { diff --git a/library/src/level2/rocsparse_hybmv.cpp b/library/src/level2/rocsparse_hybmv.cpp index 93e3bca5..c8bf4592 100644 --- a/library/src/level2/rocsparse_hybmv.cpp +++ b/library/src/level2/rocsparse_hybmv.cpp @@ -18,9 +18,10 @@ __global__ void ellmvn_kernel_host_pointer(rocsparse_int m, const T* ell_val, const T* x, T beta, - T* y) + T* y, + rocsparse_index_base idx_base) { - ellmvn_device(m, n, ell_width, alpha, ell_col_ind, ell_val, x, beta, y); + ellmvn_device(m, n, ell_width, alpha, ell_col_ind, ell_val, x, beta, y, idx_base); } template @@ -32,9 +33,10 @@ __global__ void ellmvn_kernel_device_pointer(rocsparse_int m, const T* ell_val, const T* x, const T* beta, - T* y) + T* y, + rocsparse_index_base idx_base) { - ellmvn_device(m, n, ell_width, *alpha, ell_col_ind, ell_val, x, *beta, y); + ellmvn_device(m, n, ell_width, *alpha, ell_col_ind, ell_val, x, *beta, y, idx_base); } template @@ -87,12 +89,12 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, (const 
void*&)y); } - // Check matrix type - if(descr->base != rocsparse_index_base_zero) + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) { - // TODO - return rocsparse_status_not_implemented; + return rocsparse_status_invalid_value; } + // Check matrix type if(descr->type != rocsparse_matrix_type_general) { // TODO @@ -211,7 +213,8 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, (T*)hyb->ell_val, x, *beta, - y); + y, + descr->base); } } #undef ELLMVN_DIM From dd7a56518dc8d9cb499059980e88a76d70a2c81d Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 24 May 2018 15:01:16 +0200 Subject: [PATCH 080/304] coomv: fix in kernel argument --- library/src/level2/rocsparse_coomv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/src/level2/rocsparse_coomv.cpp b/library/src/level2/rocsparse_coomv.cpp index fc17a1b4..ac10e2ad 100644 --- a/library/src/level2/rocsparse_coomv.cpp +++ b/library/src/level2/rocsparse_coomv.cpp @@ -39,7 +39,7 @@ __global__ void coomvn_warp_host_pointer(rocsparse_int nnz, template __global__ void coomvn_warp_device_pointer(rocsparse_int nnz, rocsparse_int loops, - T* alpha, + const T* alpha, const rocsparse_int* coo_row_ind, const rocsparse_int* coo_col_ind, const T* coo_val, From b51cc9376ed9056b993e456cb462ff51fffb2202 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 24 May 2018 15:01:24 +0200 Subject: [PATCH 081/304] fixed compiler warnings --- clients/common/unit.cpp | 6 ------ clients/include/utility.hpp | 6 +++--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index c0a8f11e..ab6c97bf 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -22,10 +22,8 @@ template <> void unit_check_general(rocsparse_int M, rocsparse_int N, float* hCPU, float* hGPU) { -#pragma unroll for(rocsparse_int j = 0; j < N; j++) { -#pragma unroll for(rocsparse_int i = 0; i < M; i++) 
{ #ifdef GOOGLE_TEST @@ -38,10 +36,8 @@ void unit_check_general(rocsparse_int M, rocsparse_int N, float* hCPU, float* hG template <> void unit_check_general(rocsparse_int M, rocsparse_int N, double* hCPU, double* hGPU) { -#pragma unroll for(rocsparse_int j = 0; j < N; j++) { -#pragma unroll for(rocsparse_int i = 0; i < M; i++) { #ifdef GOOGLE_TEST @@ -54,10 +50,8 @@ void unit_check_general(rocsparse_int M, rocsparse_int N, double* hCPU, double* template <> void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int* hCPU, rocsparse_int* hGPU) { -#pragma unroll for(rocsparse_int j = 0; j < N; j++) { -#pragma unroll for(rocsparse_int i = 0; i < M; i++) { #ifdef GOOGLE_TEST diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index f1a47f60..8c99033a 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -229,15 +229,15 @@ void gen_matrix_coo(rocsparse_int m, std::vector& val, rocsparse_index_base idx_base) { - if(row_ind.size() != nnz) + if((rocsparse_int)row_ind.size() != nnz) { row_ind.resize(nnz); } - if(col_ind.size() != nnz) + if((rocsparse_int)col_ind.size() != nnz) { col_ind.resize(nnz); } - if(val.size() != nnz) + if((rocsparse_int)val.size() != nnz) { val.resize(nnz); } From cdb2bf1e093f20db299aec8a81af5e900703b3a7 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 25 May 2018 17:56:29 +0200 Subject: [PATCH 082/304] additional utility stuff for tests --- clients/common/arg_check.cpp | 13 +++++++++++++ clients/include/arg_check.hpp | 2 ++ clients/include/utility.hpp | 4 ++++ 3 files changed, 19 insertions(+) diff --git a/clients/common/arg_check.cpp b/clients/common/arg_check.cpp index fecdcd57..6ae844ad 100644 --- a/clients/common/arg_check.cpp +++ b/clients/common/arg_check.cpp @@ -51,6 +51,19 @@ void verify_rocsparse_status_invalid_size(rocsparse_status status, const char* m #endif } +void verify_rocsparse_status_invalid_value(rocsparse_status status, const char* message) +{ +#ifdef GOOGLE_TEST + 
ASSERT_EQ(status, rocsparse_status_invalid_value); +#else + if(status != rocsparse_status_invalid_value) + { + std::cerr << "rocSPARSE TEST ERROR: status != rocsparse_status_invalid_value, "; + std::cerr << message << std::endl; + } +#endif +} + void verify_rocsparse_status_invalid_handle(rocsparse_status status) { #ifdef GOOGLE_TEST diff --git a/clients/include/arg_check.hpp b/clients/include/arg_check.hpp index 114d867c..292e7eb7 100644 --- a/clients/include/arg_check.hpp +++ b/clients/include/arg_check.hpp @@ -13,6 +13,8 @@ void verify_rocsparse_status_invalid_pointer(rocsparse_status status, const char void verify_rocsparse_status_invalid_size(rocsparse_status status, const char* message); +void verify_rocsparse_status_invalid_value(rocsparse_status status, const char* message); + void verify_rocsparse_status_invalid_handle(rocsparse_status status); void verify_rocsparse_status_success(rocsparse_status status, const char* message); diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index 8c99033a..7e792d22 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -598,6 +598,7 @@ class Arguments rocsparse_operation trans = rocsparse_operation_none; rocsparse_index_base idx_base = rocsparse_index_base_zero; + rocsparse_hyb_partition part = rocsparse_hyb_partition_auto; rocsparse_int norm_check = 0; rocsparse_int unit_check = 1; @@ -605,6 +606,7 @@ class Arguments rocsparse_int iters = 10; rocsparse_int laplacian = 0; + rocsparse_int ell_width = 0; std::string filename = ""; @@ -619,6 +621,7 @@ class Arguments trans = rhs.trans; idx_base = rhs.idx_base; + part = rhs.part; norm_check = rhs.norm_check; unit_check = rhs.unit_check; @@ -626,6 +629,7 @@ class Arguments iters = rhs.iters; laplacian = rhs.laplacian; + ell_width = rhs.ell_width; filename = rhs.filename; From 9d7679a249f0b947f513dfd5ec967407098561b5 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 25 May 2018 18:00:19 +0200 Subject: [PATCH 083/304] added 
hyb_partition_auto and hyb_partition_user for csr2hyb and hybmv - still experimental --- library/src/conversion/csr2hyb_device.h | 128 +++++++++++++++-- library/src/conversion/rocsparse_csr2hyb.cpp | 142 ++++++++++++++++--- library/src/level2/rocsparse_hybmv.cpp | 100 ++++++++++++- 3 files changed, 332 insertions(+), 38 deletions(-) diff --git a/library/src/conversion/csr2hyb_device.h b/library/src/conversion/csr2hyb_device.h index 5cbd08f7..57042c3f 100644 --- a/library/src/conversion/csr2hyb_device.h +++ b/library/src/conversion/csr2hyb_device.h @@ -10,6 +10,96 @@ #include +template +__device__ void sum_reduce(rocsparse_int tid, rocsparse_int* data) +{ + __syncthreads(); + + for(int i = NB >> 1; i > 0; i >>= 1) + { + if(tid < i) + { + data[tid] += data[tid + i]; + } + + __syncthreads(); + } +} + +template +__global__ void +hyb_coo_nnz_part1(rocsparse_int m, rocsparse_int ell_width, const rocsparse_int* csr_row_ptr, rocsparse_int* workspace, rocsparse_int* coo_row_nnz) +{ + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + __shared__ rocsparse_int sdata[NB]; + + if(gid < m) + { + rocsparse_int row_nnz = csr_row_ptr[gid + 1] - csr_row_ptr[gid]; + + if (row_nnz > ell_width) + { + row_nnz = row_nnz - ell_width; + sdata[tid] = row_nnz; + coo_row_nnz[gid] = row_nnz; + } + else + { + sdata[tid] = 0; + coo_row_nnz[gid] = 0; + } + } + else + { + sdata[tid] = 0; + } + + sum_reduce(tid, sdata); + + if(tid == 0) + { + workspace[hipBlockIdx_x] = sdata[0]; + } +} + +template +__global__ void hyb_coo_nnz_part2(rocsparse_int m, rocsparse_int* workspace) +{ + rocsparse_int tid = hipThreadIdx_x; + + __shared__ rocsparse_int sdata[NB]; + sdata[tid] = 0; + + for(rocsparse_int i = tid; i < m; i += NB) + { + sdata[tid] += workspace[i]; + } + + __syncthreads(); + + if(m < 32) + { + if(tid == 0) + { + for(rocsparse_int i = 1; i < m; ++i) + { + sdata[0] += sdata[i]; + } + } + } + else + { + sum_reduce(tid, sdata); + } + + if(tid 
== 0) + { + workspace[0] = sdata[0]; + } +} + template __device__ void ell_width_reduce(rocsparse_int tid, rocsparse_int* data) { @@ -95,7 +185,11 @@ __global__ void csr2ell_kernel(rocsparse_int m, const rocsparse_int* csr_col_ind, rocsparse_int ell_width, rocsparse_int* ell_col_ind, - T* ell_val) + T* ell_val, + rocsparse_int* coo_row_ind, + rocsparse_int* coo_col_ind, + T* coo_val, + rocsparse_int* workspace) { rocsparse_int ai = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -104,26 +198,32 @@ __global__ void csr2ell_kernel(rocsparse_int m, return; } - rocsparse_int p = 0; - rocsparse_int aj = csr_row_ptr[ai]; + rocsparse_int p = 0; + + rocsparse_int row_begin = csr_row_ptr[ai]; + rocsparse_int row_end = csr_row_ptr[ai + 1]; + rocsparse_int coo_idx = workspace[ai]; - // Fill ELL matrix - for(; aj < csr_row_ptr[ai + 1]; ++aj) + // Fill HYB matrix + for(rocsparse_int aj = row_begin; aj < row_end; ++aj) { - if(p >= ell_width) + if (p < ell_width) { - break; + rocsparse_int idx = ELL_IND(ai, p++, m, ell_width); + ell_col_ind[idx] = csr_col_ind[aj]; + ell_val[idx] = csr_val[aj]; + } + else + { + coo_row_ind[coo_idx] = ai; + coo_col_ind[coo_idx] = csr_col_ind[aj]; + coo_val[coo_idx] = csr_val[aj]; + ++coo_idx; } - - rocsparse_int idx = ELL_IND(ai, p++, m, ell_width); - ell_col_ind[idx] = csr_col_ind[aj]; - ell_val[idx] = csr_val[aj]; } - // TODO store rownnz - // Pad remaining ELL structure - for(; aj < ell_width; ++aj) + for(rocsparse_int aj = row_end - row_begin; aj < ell_width; ++aj) { rocsparse_int idx = ELL_IND(ai, p++, m, ell_width); ell_col_ind[idx] = -1; diff --git a/library/src/conversion/rocsparse_csr2hyb.cpp b/library/src/conversion/rocsparse_csr2hyb.cpp index 04516e81..3aa2191f 100644 --- a/library/src/conversion/rocsparse_csr2hyb.cpp +++ b/library/src/conversion/rocsparse_csr2hyb.cpp @@ -49,20 +49,22 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, user_ell_width, partition_type); - // Check matrix type + // Check index base 
if(descr->base != rocsparse_index_base_zero) { // TODO return rocsparse_status_not_implemented; } + // Check matrix type if(descr->type != rocsparse_matrix_type_general) { // TODO return rocsparse_status_not_implemented; } - if(partition_type != rocsparse_hyb_partition_max) + // Check partition type + if(partition_type != rocsparse_hyb_partition_max && partition_type != rocsparse_hyb_partition_user && partition_type != rocsparse_hyb_partition_auto) { - return rocsparse_status_not_implemented; + return rocsparse_status_invalid_value; } // Check sizes @@ -95,6 +97,26 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, return rocsparse_status_success; } + // Check user_ell_width + if(partition_type == rocsparse_hyb_partition_user) + { + // ELL width cannot be 0 or negative + if(user_ell_width < 0) + { + return rocsparse_status_invalid_value; + } + + // Limit ELL allocation to two times the CSR non-zero elements + rocsparse_int csr_nnz; + RETURN_IF_HIP_ERROR(hipMemcpy(&csr_nnz, csr_row_ptr+m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + rocsparse_int max_row_nnz = (2 * csr_nnz - 1) / m + 1; + if(user_ell_width > max_row_nnz) + { + return rocsparse_status_invalid_value; + } + } + // Stream hipStream_t stream = handle->stream; @@ -127,17 +149,30 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, RETURN_IF_HIP_ERROR(hipFree(hyb->coo_val)); } -#define CSR2ELL_DIM 512 - // TODO we take max partition - if(partition_type == rocsparse_hyb_partition_max) - { - // ELL part only, compute maximum non-zeros per row - rocsparse_int blocks = handle->warp_size; + // Determine ELL width + rocsparse_int csr_nnz; + RETURN_IF_HIP_ERROR(hipMemcpy(&csr_nnz, csr_row_ptr+m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); - // Allocate workspace - rocsparse_int* workspace = NULL; - RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace, sizeof(rocsparse_int) * blocks)); +#define CSR2ELL_DIM 512 + // Workspace size + rocsparse_int blocks = (m - 1) / 
CSR2ELL_DIM + 1; + // Allocate workspace + rocsparse_int* workspace = NULL; + RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace, sizeof(rocsparse_int) * blocks)); + if(partition_type == rocsparse_hyb_partition_user) + { + // ELL width given by user + hyb->ell_width = user_ell_width; + } + else if(partition_type == rocsparse_hyb_partition_auto) + { + // ELL width determined by average nnz per row + hyb->ell_width = (csr_nnz - 1) / m + 1; + } + else + { + // HYB == ELL - no COO part - compute maximum nnz per row hipLaunchKernelGGL((ell_width_kernel_part1), dim3(blocks), dim3(CSR2ELL_DIM), @@ -154,16 +189,9 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, stream, blocks, workspace); - // Copy ell width back to host RETURN_IF_HIP_ERROR( hipMemcpy(&hyb->ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); - RETURN_IF_HIP_ERROR(hipFree(workspace)); - } - else - { - // TODO - return rocsparse_status_not_implemented; } // Compute ELL non-zeros @@ -173,6 +201,71 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->ell_col_ind, sizeof(rocsparse_int) * hyb->ell_nnz)); RETURN_IF_HIP_ERROR(hipMalloc(&hyb->ell_val, sizeof(T) * hyb->ell_nnz)); + // Allocate workspace2 + rocsparse_int* workspace2 = NULL; + RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace2, sizeof(rocsparse_int) * (m + 1))); + + // If there is a COO part, compute the COO non-zero elements per row + if(partition_type != rocsparse_hyb_partition_max) + { + // If there is no ELL part, its easy... 
+ if(hyb->ell_nnz == 0) + { + hyb->coo_nnz = csr_nnz; + RETURN_IF_HIP_ERROR(hipMemcpy(workspace2, csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToDevice)); + } + else + { + hipLaunchKernelGGL((hyb_coo_nnz_part1), + dim3((m - 1) / CSR2ELL_DIM + 1), + dim3(CSR2ELL_DIM), + 0, + stream, + m, + hyb->ell_width, + csr_row_ptr, + workspace, + workspace2); + + hipLaunchKernelGGL((hyb_coo_nnz_part2), + dim3(1), + dim3(CSR2ELL_DIM), + 0, + stream, + blocks, + workspace); + + RETURN_IF_HIP_ERROR( + hipMemcpy(&hyb->coo_nnz, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + // Perform exclusive scan on workspace TODO use rocPRIM + std::vector hbuf(m+1); + RETURN_IF_HIP_ERROR(hipMemcpy(hbuf.data() + 1, workspace2, sizeof(rocsparse_int) * m, hipMemcpyDeviceToHost)); + + hbuf[0] = 0; + for (rocsparse_int i = 0; i < m; ++i) + { + hbuf[i+1] += hbuf[i]; + } + + RETURN_IF_HIP_ERROR(hipMemcpy(workspace2, hbuf.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + } + + } + + + RETURN_IF_HIP_ERROR(hipFree(workspace)); + + + + // Allocate COO part + RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->coo_row_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); + RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->coo_col_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); + RETURN_IF_HIP_ERROR(hipMalloc(&hyb->coo_val, sizeof(T) * hyb->coo_nnz)); + + + + dim3 csr2ell_blocks((m - 1) / CSR2ELL_DIM + 1); dim3 csr2ell_threads(CSR2ELL_DIM); @@ -187,8 +280,17 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, csr_col_ind, hyb->ell_width, hyb->ell_col_ind, - (T*)hyb->ell_val); + (T*)hyb->ell_val, + hyb->coo_row_ind, + hyb->coo_col_ind, + (T*)hyb->coo_val, + workspace2); + + + + RETURN_IF_HIP_ERROR(hipFree(workspace2)); #undef CSR2ELL_DIM + return rocsparse_status_success; } diff --git a/library/src/level2/rocsparse_hybmv.cpp b/library/src/level2/rocsparse_hybmv.cpp index c8bf4592..ffbf5a51 100644 --- a/library/src/level2/rocsparse_hybmv.cpp +++ 
b/library/src/level2/rocsparse_hybmv.cpp @@ -3,8 +3,10 @@ * ************************************************************************ */ #include "rocsparse.h" +#include "definitions.h" #include "handle.h" #include "utility.h" +#include "coomv_device.h" #include "ellmv_device.h" #include @@ -39,6 +41,32 @@ __global__ void ellmvn_kernel_device_pointer(rocsparse_int m, ellmvn_device(m, n, ell_width, *alpha, ell_col_ind, ell_val, x, *beta, y, idx_base); } +template +__global__ void coomvn_warp_host_pointer(rocsparse_int nnz, + rocsparse_int loops, + T alpha, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const T* coo_val, + const T* x, + T* y, + rocsparse_int* row_block_red, + T* val_block_red, + rocsparse_index_base idx_base) +{ + coomvn_general_warp_reduce(nnz, + loops, + alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + idx_base); +} + template rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, rocsparse_operation trans, @@ -100,10 +128,11 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, // TODO return rocsparse_status_not_implemented; } - if(hyb->partition != rocsparse_hyb_partition_max) - { - return rocsparse_status_not_implemented; - } +// TODO check partition type +// if(hyb->partition != rocsparse_hyb_partition_max) +// { +// return rocsparse_status_not_implemented; +// } // Check sizes if(hyb->m < 0) @@ -215,6 +244,69 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, *beta, y, descr->base); + } + + // COO part + if(hyb->coo_nnz > 0) + { +// TODO +#define COOMVN_DIM 128 + rocsparse_int maxthreads = handle->properties.maxThreadsPerBlock; + rocsparse_int nprocs = handle->properties.multiProcessorCount; + rocsparse_int maxblocks = (nprocs * maxthreads - 1) / COOMVN_DIM + 1; + rocsparse_int minblocks = (hyb->coo_nnz - 1) / COOMVN_DIM + 1; + + rocsparse_int nblocks = maxblocks < minblocks ? 
maxblocks : minblocks; + rocsparse_int nwarps = nblocks * (COOMVN_DIM / handle->warp_size); + rocsparse_int nloops = (hyb->coo_nnz / handle->warp_size + 1) / nwarps + 1; + + dim3 coomvn_blocks(nblocks); + dim3 coomvn_threads(COOMVN_DIM); + + rocsparse_int* row_block_red = NULL; + T* val_block_red = NULL; + + RETURN_IF_HIP_ERROR(hipMalloc((void**)&row_block_red, sizeof(rocsparse_int) * nwarps)); + RETURN_IF_HIP_ERROR(hipMalloc((void**)&val_block_red, sizeof(T) * nwarps)); + + hipLaunchKernelGGL((coomvn_warp_host_pointer), + coomvn_blocks, + coomvn_threads, + 0, + stream, + hyb->coo_nnz, + nloops, + *alpha, + hyb->coo_row_ind, + hyb->coo_col_ind, + hyb->coo_val, + x, + y, + row_block_red, + val_block_red, + descr->base); + + hipLaunchKernelGGL((coomvn_general_block_reduce), + dim3(1), + coomvn_threads, + 0, + stream, + nwarps, + row_block_red, + val_block_red, + y); + + RETURN_IF_HIP_ERROR(hipFree(row_block_red)); + RETURN_IF_HIP_ERROR(hipFree(val_block_red)); +#undef COOMVN_DIM + + + + + + + + } } #undef ELLMVN_DIM From c0c37da10160c8214f13620141e8fb7c95495708 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 25 May 2018 18:01:46 +0200 Subject: [PATCH 084/304] csr2hyb tests covering all cases --- .../rocsparse_template_specialization.cpp | 30 ++ clients/include/rocsparse.hpp | 12 + clients/include/testing_csr2hyb.hpp | 395 ++++++++++++++++++ clients/tests/CMakeLists.txt | 1 + clients/tests/test_csr2hyb.cpp | 70 ++++ 5 files changed, 508 insertions(+) create mode 100644 clients/include/testing_csr2hyb.hpp create mode 100644 clients/tests/test_csr2hyb.cpp diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 386176c1..e4f41ee6 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -134,4 +134,34 @@ rocsparse_status rocsparse_hybmv(rocsparse_handle handle, return rocsparse_dhybmv(handle, trans, alpha, descr, hyb, x, beta, 
y); } +template <> +rocsparse_status rocsparse_csr2hyb(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type) +{ + return rocsparse_scsr2hyb(handle, m, n, descr, csr_val, csr_row_ptr, csr_col_ind, hyb, user_ell_width, partition_type); +} + +template <> +rocsparse_status rocsparse_csr2hyb(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type) +{ + return rocsparse_dcsr2hyb(handle, m, n, descr, csr_val, csr_row_ptr, csr_col_ind, hyb, user_ell_width, partition_type); +} + } // namespace rocsparse diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index 7157c00a..47858b7f 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -57,6 +57,18 @@ rocsparse_status rocsparse_hybmv(rocsparse_handle handle, const T* x, const T* beta, T* y); + +template +rocsparse_status rocsparse_csr2hyb(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type); } #endif // _ROCSPARSE_HPP_ diff --git a/clients/include/testing_csr2hyb.hpp b/clients/include/testing_csr2hyb.hpp new file mode 100644 index 00000000..42448e90 --- /dev/null +++ b/clients/include/testing_csr2hyb.hpp @@ -0,0 +1,395 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef TESTING_CSR2HYB_HPP +#define TESTING_CSR2HYB_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +#define ELL_IND_ROW(i, el, m, width) (el) * (m) + (i) +#define ELL_IND_EL(i, el, m, width) (el) + (width) * (i) +#define ELL_IND(i, el, m, width) ELL_IND_ROW(i, el, m, width) + +struct test_hyb +{ + rocsparse_int m; + rocsparse_int n; + rocsparse_hyb_partition partition; + rocsparse_int ell_nnz; + rocsparse_int ell_width; + rocsparse_int* ell_col_ind; + void* ell_val; + rocsparse_int coo_nnz; + rocsparse_int* coo_row_ind; + rocsparse_int* coo_col_ind; + void* coo_val; +}; + +template +void testing_csr2hyb_bad_arg(void) +{ + rocsparse_int m = 100; + rocsparse_int n = 100; + rocsparse_int safe_size = 100; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_descr(new descr_struct); + rocsparse_mat_descr descr = unique_ptr_descr->descr; + + std::unique_ptr unique_ptr_hyb(new hyb_struct); + rocsparse_hyb_mat hyb = unique_ptr_hyb->hyb; + + auto csr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); + rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); + T* csr_val = (T*)csr_val_managed.get(); + + if(!csr_row_ptr || !csr_col_ind || !csr_val) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // Testing for(csr_row_ptr == nullptr) + { 
+ rocsparse_int* csr_row_ptr_null = nullptr; + + status = rocsparse_csr2hyb(handle, m, n, descr, csr_val, csr_row_ptr_null, csr_col_ind, hyb, 0, rocsparse_hyb_partition_auto); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); + } + // Testing for(csr_col_ind == nullptr) + { + rocsparse_int* csr_col_ind_null = nullptr; + + status = rocsparse_csr2hyb(handle, m, n, descr, csr_val, csr_row_ptr, csr_col_ind_null, hyb, 0, rocsparse_hyb_partition_auto); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr"); + } + // Testing for(csr_val == nullptr) + { + T* csr_val_null = nullptr; + + status = rocsparse_csr2hyb(handle, m, n, descr, csr_val_null, csr_row_ptr, csr_col_ind, hyb, 0, rocsparse_hyb_partition_auto); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_val is nullptr"); + } + // Testing for(handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csr2hyb(handle_null, m, n, descr, csr_val, csr_row_ptr, csr_col_ind, hyb, 0, rocsparse_hyb_partition_auto); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_csr2hyb(Arguments argus) +{ + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + rocsparse_int safe_size = 100; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_hyb_partition part = argus.part; + rocsparse_int user_ell_width = argus.ell_width; + rocsparse_status status; + + double scale = 0.02; + if(m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_descr(new descr_struct); + rocsparse_mat_descr descr = unique_ptr_descr->descr; + + std::unique_ptr unique_ptr_hyb(new hyb_struct); + rocsparse_hyb_mat hyb = unique_ptr_hyb->hyb; + + // Argument sanity check before allocating invalid memory + if(m <= 0 || n <= 0 || nnz 
<= 0) + { + auto csr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); + rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); + T* csr_val = (T*)csr_val_managed.get(); + + if(!csr_row_ptr || !csr_col_ind || !csr_val) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!csr_row_ptr || !csr_col_ind || !csr_val"); + return rocsparse_status_memory_error; + } + + status = rocsparse_csr2hyb(handle, m, n, descr, csr_val, csr_row_ptr, csr_col_ind, hyb, user_ell_width, part); + + if(m < 0 || n < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0"); + } + + return rocsparse_status_success; + } + + // For testing, assemble a COO matrix and convert it to CSR first (on host) + + // Host structures + std::vector hcoo_row_ind(nnz); + std::vector hcsr_col_ind(nnz); + std::vector hcsr_val(nnz); + + // Sample initial COO matrix on CPU + srand(12345ULL); + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base); + + // Convert COO to CSR + std::vector hcsr_row_ptr(m + 1); + + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; + } + + hcsr_row_ptr[0] = idx_base; + for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + } + + // Allocate memory on the device + auto dcsr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + auto dcsr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dcsr_val_managed = 
rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + + rocsparse_int* dcsr_row_ptr = (rocsparse_int*)dcsr_row_ptr_managed.get(); + rocsparse_int* dcsr_col_ind = (rocsparse_int*)dcsr_col_ind_managed.get(); + T* dcsr_val = (T*)dcsr_val_managed.get(); + + if(!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val"); + return rocsparse_status_memory_error; + } + + // Copy data from host to device + CHECK_HIP_ERROR(hipMemcpy( + dcsr_row_ptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy( + dcsr_col_ind, hcsr_col_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy( + dcsr_val, hcsr_val.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + + // User given ELL width check + if(part == rocsparse_hyb_partition_user) + { + // ELL width -33 means we take a reasonable pre-computed width + if(user_ell_width == -33) + { + user_ell_width = nnz / m; + } + + // Test invalid user_ell_width + rocsparse_int max_allowed_ell_nnz_per_row = (2 * nnz - 1) / m + 1; + if(user_ell_width < 0 || user_ell_width > max_allowed_ell_nnz_per_row) + { + status = rocsparse_csr2hyb(handle, m, n, descr, dcsr_val, dcsr_row_ptr, dcsr_col_ind, hyb, user_ell_width, part); + + verify_rocsparse_status_invalid_value(status, "Error: user_ell_width < 0 || user_ell_width > max_ell_width"); + + return rocsparse_status_success; + } + } + + // Host structures for verification + std::vector hhyb_ell_col_ind_gold; + std::vector hhyb_ell_val_gold; + std::vector hhyb_coo_row_ind_gold; + std::vector hhyb_coo_col_ind_gold; + std::vector hhyb_coo_val_gold; + + // Host csr2hyb conversion + rocsparse_int ell_width = 0; + rocsparse_int ell_nnz = 0; + rocsparse_int coo_nnz = 0; + + if(part == rocsparse_hyb_partition_auto || part == rocsparse_hyb_partition_user) + { + if(part == rocsparse_hyb_partition_auto) + { 
+ // ELL width is average nnz per row + ell_width = (nnz - 1) / m + 1; + } + else + { + // User given ELL width + ell_width = user_ell_width; + } + + ell_nnz = ell_width * m; + + // Determine COO nnz + for(rocsparse_int i = 0; i < m; ++i) + { + rocsparse_int row_nnz = hcsr_row_ptr[i + 1] - hcsr_row_ptr[i]; + if(row_nnz > ell_width) + { + coo_nnz += row_nnz - ell_width; + } + } + } + else if(part == rocsparse_hyb_partition_max) + { + // Determine max nnz per row + for(rocsparse_int i = 0; i < m; ++i) + { + rocsparse_int row_nnz = hcsr_row_ptr[i + 1] - hcsr_row_ptr[i]; + ell_width = (row_nnz > ell_width) ? row_nnz : ell_width; + } + ell_nnz = ell_width * m; + } + + // Allocate host memory + // ELL + hhyb_ell_col_ind_gold.resize(ell_nnz); + hhyb_ell_val_gold.resize(ell_nnz); + // COO + hhyb_coo_row_ind_gold.resize(coo_nnz); + hhyb_coo_col_ind_gold.resize(coo_nnz); + hhyb_coo_val_gold.resize(coo_nnz); + + // Fill HYB + rocsparse_int coo_idx = 0; + for(rocsparse_int i = 0; i < m; ++i) + { + rocsparse_int p = 0; + for(rocsparse_int j = hcsr_row_ptr[i]; j < hcsr_row_ptr[i + 1]; ++j) + { + if(p < ell_width) + { + rocsparse_int idx = ELL_IND(i, p++, m, ell_width); + hhyb_ell_col_ind_gold[idx] = hcsr_col_ind[j]; + hhyb_ell_val_gold[idx] = hcsr_val[j]; + } + else + { + hhyb_coo_row_ind_gold[coo_idx] = i; + hhyb_coo_col_ind_gold[coo_idx] = hcsr_col_ind[j]; + hhyb_coo_val_gold[coo_idx] = hcsr_val[j]; + ++coo_idx; + } + } + for(rocsparse_int j = hcsr_row_ptr[i + 1] - hcsr_row_ptr[i]; j < ell_width; ++j) + { + rocsparse_int idx = ELL_IND(i, p++, m, ell_width); + hhyb_ell_col_ind_gold[idx] = -1; + hhyb_ell_val_gold[idx] = static_cast(0); + } + } + + + + + + // Allocate verification structures + std::vector hhyb_ell_col_ind(ell_nnz); + std::vector hhyb_ell_val(ell_nnz); + std::vector hhyb_coo_row_ind(coo_nnz); + std::vector hhyb_coo_col_ind(coo_nnz); + std::vector hhyb_coo_val(coo_nnz); + + if(argus.unit_check) + { + CHECK_ROCSPARSE_ERROR(rocsparse_csr2hyb(handle, m, n, descr, 
dcsr_val, dcsr_row_ptr, dcsr_col_ind, hyb, user_ell_width, part)); + + // Copy output from device to host + test_hyb *dhyb = (test_hyb*)hyb; + + // Check if sizes match + unit_check_general(1, 1, &dhyb->m, &m); + unit_check_general(1, 1, &dhyb->n, &n); + unit_check_general(1, 1, &dhyb->ell_width, &ell_width); + unit_check_general(1, 1, &dhyb->ell_nnz, &ell_nnz); + unit_check_general(1, 1, &dhyb->coo_nnz, &coo_nnz); + + CHECK_HIP_ERROR(hipMemcpy(hhyb_ell_col_ind.data(), dhyb->ell_col_ind, sizeof(rocsparse_int) * ell_nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hhyb_ell_val.data(), dhyb->ell_val, sizeof(T) * ell_nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hhyb_coo_row_ind.data(), dhyb->coo_row_ind, sizeof(rocsparse_int) * coo_nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hhyb_coo_col_ind.data(), dhyb->coo_col_ind, sizeof(rocsparse_int) * coo_nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hhyb_coo_val.data(), dhyb->coo_val, sizeof(T) * coo_nnz, hipMemcpyDeviceToHost)); + + // Unit check + unit_check_general(1, ell_nnz, hhyb_ell_col_ind_gold.data(), hhyb_ell_col_ind.data()); + unit_check_general(1, ell_nnz, hhyb_ell_val_gold.data(), hhyb_ell_val.data()); + unit_check_general(1, coo_nnz, hhyb_coo_row_ind_gold.data(), hhyb_coo_row_ind.data()); + unit_check_general(1, coo_nnz, hhyb_coo_col_ind_gold.data(), hhyb_coo_col_ind.data()); + unit_check_general(1, coo_nnz, hhyb_coo_val_gold.data(), hhyb_coo_val.data()); + } + + + + + + + +/* + if(argus.timing) + { + rocsparse_int number_cold_calls = 2; + rocsparse_int number_hot_calls = argus.iters; + + for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) + { + rocsparse_csr2hyb(handle, dcsr_row_ptr, nnz, m, dhyb_row_ind, idx_base); + } + + double gpu_time_used = get_time_us(); + + for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) + { + rocsparse_csr2hyb(handle, dcsr_row_ptr, nnz, m, dhyb_row_ind, idx_base); + } + + gpu_time_used = (get_time_us() - 
gpu_time_used) / (number_hot_calls * 1e3); + + double bandwidth = sizeof(rocsparse_int) * (nnz + m + 1) / gpu_time_used / 1e6; + + printf("m\t\tn\t\tnnz\t\tGB/s\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\n", m, n, nnz, bandwidth, gpu_time_used); + } +*/ + return rocsparse_status_success; +} + +#endif // TESTING_CSR2HYB_HPP diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index c0a0e811..de9d2bb4 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -10,6 +10,7 @@ set(ROCSPARSE_TEST_SOURCES test_coomv.cpp test_csrmv.cpp test_csr2coo.cpp + test_csr2hyb.cpp test_coo2csr.cpp ) diff --git a/clients/tests/test_csr2hyb.cpp b/clients/tests/test_csr2hyb.cpp new file mode 100644 index 00000000..3fc332ea --- /dev/null +++ b/clients/tests/test_csr2hyb.cpp @@ -0,0 +1,70 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "testing_csr2hyb.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef std::tuple csr2hyb_tuple; + +int csr2hyb_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int csr2hyb_N_range[] = {-3, 0, 33, 242, 623, 1000}; + +rocsparse_index_base csr2hyb_idx_base_range[] = {rocsparse_index_base_zero}; //TODO + +rocsparse_hyb_partition csr2hyb_partition[] = {rocsparse_hyb_partition_auto, + rocsparse_hyb_partition_max, + rocsparse_hyb_partition_user}; + +int csr2hyb_ELL_range[] = {-33, -1, 0, INT32_MAX}; + +class parameterized_csr2hyb : public testing::TestWithParam +{ + protected: + parameterized_csr2hyb() {} + virtual ~parameterized_csr2hyb() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_csr2hyb_arguments(csr2hyb_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.part = std::get<3>(tup); + arg.ell_width = std::get<4>(tup); + 
arg.timing = 0; + return arg; +} + +TEST(csr2hyb_bad_arg, csr2hyb) { testing_csr2hyb_bad_arg(); } + +TEST_P(parameterized_csr2hyb, csr2hyb_float) +{ + Arguments arg = setup_csr2hyb_arguments(GetParam()); + + rocsparse_status status = testing_csr2hyb(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csr2hyb, csr2hyb_double) +{ + Arguments arg = setup_csr2hyb_arguments(GetParam()); + + rocsparse_status status = testing_csr2hyb(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(csr2hyb, + parameterized_csr2hyb, + testing::Combine(testing::ValuesIn(csr2hyb_M_range), + testing::ValuesIn(csr2hyb_N_range), + testing::ValuesIn(csr2hyb_idx_base_range), + testing::ValuesIn(csr2hyb_partition), + testing::ValuesIn(csr2hyb_ELL_range))); From 4a48e18cd9b341831e074c6f972cd3ffcadeec42 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 25 May 2018 23:20:42 +0200 Subject: [PATCH 085/304] compiler warning fix --- library/src/level2/rocsparse_hybmv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/src/level2/rocsparse_hybmv.cpp b/library/src/level2/rocsparse_hybmv.cpp index ffbf5a51..d5ce0f36 100644 --- a/library/src/level2/rocsparse_hybmv.cpp +++ b/library/src/level2/rocsparse_hybmv.cpp @@ -279,7 +279,7 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, *alpha, hyb->coo_row_ind, hyb->coo_col_ind, - hyb->coo_val, + (T*)hyb->coo_val, x, y, row_block_red, From cf7587541d544e30c339589a8b0a04c0c0622740 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 25 May 2018 23:21:03 +0200 Subject: [PATCH 086/304] csr2hyb: fixed a bug with uninitialized value --- library/src/conversion/csr2hyb_device.h | 4 ++-- library/src/conversion/rocsparse_csr2hyb.cpp | 16 +++++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/library/src/conversion/csr2hyb_device.h b/library/src/conversion/csr2hyb_device.h index 57042c3f..aad0aa3f 100644 --- 
a/library/src/conversion/csr2hyb_device.h +++ b/library/src/conversion/csr2hyb_device.h @@ -202,8 +202,8 @@ __global__ void csr2ell_kernel(rocsparse_int m, rocsparse_int row_begin = csr_row_ptr[ai]; rocsparse_int row_end = csr_row_ptr[ai + 1]; - rocsparse_int coo_idx = workspace[ai]; - + rocsparse_int coo_idx = coo_row_ind ? workspace[ai] : 0; + // Fill HYB matrix for(rocsparse_int aj = row_begin; aj < row_end; ++aj) { diff --git a/library/src/conversion/rocsparse_csr2hyb.cpp b/library/src/conversion/rocsparse_csr2hyb.cpp index 3aa2191f..62caf9de 100644 --- a/library/src/conversion/rocsparse_csr2hyb.cpp +++ b/library/src/conversion/rocsparse_csr2hyb.cpp @@ -198,8 +198,11 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, hyb->ell_nnz = hyb->ell_width * m; // Allocate ELL part - RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->ell_col_ind, sizeof(rocsparse_int) * hyb->ell_nnz)); - RETURN_IF_HIP_ERROR(hipMalloc(&hyb->ell_val, sizeof(T) * hyb->ell_nnz)); + if(hyb->ell_nnz > 0) + { + RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->ell_col_ind, sizeof(rocsparse_int) * hyb->ell_nnz)); + RETURN_IF_HIP_ERROR(hipMalloc(&hyb->ell_val, sizeof(T) * hyb->ell_nnz)); + } // Allocate workspace2 rocsparse_int* workspace2 = NULL; @@ -259,9 +262,12 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, // Allocate COO part - RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->coo_row_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); - RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->coo_col_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); - RETURN_IF_HIP_ERROR(hipMalloc(&hyb->coo_val, sizeof(T) * hyb->coo_nnz)); + if(hyb->coo_nnz > 0) + { + RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->coo_row_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); + RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->coo_col_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); + RETURN_IF_HIP_ERROR(hipMalloc(&hyb->coo_val, sizeof(T) * hyb->coo_nnz)); + } From b3e387b51116d29b61a24fa0e89f538551bed787 Mon Sep 17 00:00:00 
2001 From: Nico Trost Date: Sat, 26 May 2018 22:58:39 +0200 Subject: [PATCH 087/304] csr2hyb: support for index_base_one --- clients/include/testing_csr2hyb.hpp | 19 +++++++++------- clients/tests/test_csr2hyb.cpp | 3 ++- library/src/conversion/csr2hyb_device.h | 13 ++++++----- library/src/conversion/rocsparse_csr2hyb.cpp | 23 ++++++++++---------- 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/clients/include/testing_csr2hyb.hpp b/clients/include/testing_csr2hyb.hpp index 42448e90..90fbfa19 100644 --- a/clients/include/testing_csr2hyb.hpp +++ b/clients/include/testing_csr2hyb.hpp @@ -123,6 +123,9 @@ rocsparse_status testing_csr2hyb(Arguments argus) std::unique_ptr unique_ptr_descr(new descr_struct); rocsparse_mat_descr descr = unique_ptr_descr->descr; + // Set matrix index base + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr, idx_base)); + std::unique_ptr unique_ptr_hyb(new hyb_struct); rocsparse_hyb_mat hyb = unique_ptr_hyb->hyb; @@ -294,18 +297,18 @@ rocsparse_status testing_csr2hyb(Arguments argus) for(rocsparse_int i = 0; i < m; ++i) { rocsparse_int p = 0; - for(rocsparse_int j = hcsr_row_ptr[i]; j < hcsr_row_ptr[i + 1]; ++j) + for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; ++j) { if(p < ell_width) { rocsparse_int idx = ELL_IND(i, p++, m, ell_width); - hhyb_ell_col_ind_gold[idx] = hcsr_col_ind[j]; + hhyb_ell_col_ind_gold[idx] = hcsr_col_ind[j] - idx_base; hhyb_ell_val_gold[idx] = hcsr_val[j]; } else { hhyb_coo_row_ind_gold[coo_idx] = i; - hhyb_coo_col_ind_gold[coo_idx] = hcsr_col_ind[j]; + hhyb_coo_col_ind_gold[coo_idx] = hcsr_col_ind[j] - idx_base; hhyb_coo_val_gold[coo_idx] = hcsr_val[j]; ++coo_idx; } @@ -337,11 +340,11 @@ rocsparse_status testing_csr2hyb(Arguments argus) test_hyb *dhyb = (test_hyb*)hyb; // Check if sizes match - unit_check_general(1, 1, &dhyb->m, &m); - unit_check_general(1, 1, &dhyb->n, &n); - unit_check_general(1, 1, &dhyb->ell_width, &ell_width); - unit_check_general(1, 1, 
&dhyb->ell_nnz, &ell_nnz); - unit_check_general(1, 1, &dhyb->coo_nnz, &coo_nnz); + unit_check_general(1, 1, &m, &dhyb->m); + unit_check_general(1, 1, &n, &dhyb->n); + unit_check_general(1, 1, &ell_width, &dhyb->ell_width); + unit_check_general(1, 1, &ell_nnz, &dhyb->ell_nnz); + unit_check_general(1, 1, &coo_nnz, &dhyb->coo_nnz); CHECK_HIP_ERROR(hipMemcpy(hhyb_ell_col_ind.data(), dhyb->ell_col_ind, sizeof(rocsparse_int) * ell_nnz, hipMemcpyDeviceToHost)); CHECK_HIP_ERROR(hipMemcpy(hhyb_ell_val.data(), dhyb->ell_val, sizeof(T) * ell_nnz, hipMemcpyDeviceToHost)); diff --git a/clients/tests/test_csr2hyb.cpp b/clients/tests/test_csr2hyb.cpp index 3fc332ea..49d15867 100644 --- a/clients/tests/test_csr2hyb.cpp +++ b/clients/tests/test_csr2hyb.cpp @@ -14,7 +14,8 @@ typedef std::tuple int csr2hyb_M_range[] = {-1, 0, 10, 500, 872, 1000}; int csr2hyb_N_range[] = {-3, 0, 33, 242, 623, 1000}; -rocsparse_index_base csr2hyb_idx_base_range[] = {rocsparse_index_base_zero}; //TODO +rocsparse_index_base csr2hyb_idx_base_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; rocsparse_hyb_partition csr2hyb_partition[] = {rocsparse_hyb_partition_auto, rocsparse_hyb_partition_max, diff --git a/library/src/conversion/csr2hyb_device.h b/library/src/conversion/csr2hyb_device.h index aad0aa3f..a7394c87 100644 --- a/library/src/conversion/csr2hyb_device.h +++ b/library/src/conversion/csr2hyb_device.h @@ -189,7 +189,8 @@ __global__ void csr2ell_kernel(rocsparse_int m, rocsparse_int* coo_row_ind, rocsparse_int* coo_col_ind, T* coo_val, - rocsparse_int* workspace) + rocsparse_int* workspace, + rocsparse_int idx_base) { rocsparse_int ai = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -200,9 +201,9 @@ __global__ void csr2ell_kernel(rocsparse_int m, rocsparse_int p = 0; - rocsparse_int row_begin = csr_row_ptr[ai]; - rocsparse_int row_end = csr_row_ptr[ai + 1]; - rocsparse_int coo_idx = coo_row_ind ? 
workspace[ai] : 0; + rocsparse_int row_begin = csr_row_ptr[ai] - idx_base; + rocsparse_int row_end = csr_row_ptr[ai + 1] - idx_base; + rocsparse_int coo_idx = coo_row_ind ? workspace[ai] - idx_base : 0; // Fill HYB matrix for(rocsparse_int aj = row_begin; aj < row_end; ++aj) @@ -210,13 +211,13 @@ __global__ void csr2ell_kernel(rocsparse_int m, if (p < ell_width) { rocsparse_int idx = ELL_IND(ai, p++, m, ell_width); - ell_col_ind[idx] = csr_col_ind[aj]; + ell_col_ind[idx] = csr_col_ind[aj] - idx_base; ell_val[idx] = csr_val[aj]; } else { coo_row_ind[coo_idx] = ai; - coo_col_ind[coo_idx] = csr_col_ind[aj]; + coo_col_ind[coo_idx] = csr_col_ind[aj] - idx_base; coo_val[coo_idx] = csr_val[aj]; ++coo_idx; } diff --git a/library/src/conversion/rocsparse_csr2hyb.cpp b/library/src/conversion/rocsparse_csr2hyb.cpp index 62caf9de..ccccefe6 100644 --- a/library/src/conversion/rocsparse_csr2hyb.cpp +++ b/library/src/conversion/rocsparse_csr2hyb.cpp @@ -50,10 +50,9 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, partition_type); // Check index base - if(descr->base != rocsparse_index_base_zero) + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) { - // TODO - return rocsparse_status_not_implemented; + return rocsparse_status_invalid_value; } // Check matrix type if(descr->type != rocsparse_matrix_type_general) @@ -97,6 +96,13 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, return rocsparse_status_success; } + // Get number of CSR non-zeros + rocsparse_int csr_nnz; + RETURN_IF_HIP_ERROR(hipMemcpy(&csr_nnz, csr_row_ptr+m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + // Correct by index base + csr_nnz -= descr->base; + // Check user_ell_width if(partition_type == rocsparse_hyb_partition_user) { @@ -106,10 +112,6 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, return rocsparse_status_invalid_value; } - // Limit ELL allocation to two times the CSR non-zero elements - 
rocsparse_int csr_nnz; - RETURN_IF_HIP_ERROR(hipMemcpy(&csr_nnz, csr_row_ptr+m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); - rocsparse_int max_row_nnz = (2 * csr_nnz - 1) / m + 1; if(user_ell_width > max_row_nnz) { @@ -150,8 +152,6 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, } // Determine ELL width - rocsparse_int csr_nnz; - RETURN_IF_HIP_ERROR(hipMemcpy(&csr_nnz, csr_row_ptr+m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); #define CSR2ELL_DIM 512 // Workspace size @@ -245,7 +245,7 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, std::vector hbuf(m+1); RETURN_IF_HIP_ERROR(hipMemcpy(hbuf.data() + 1, workspace2, sizeof(rocsparse_int) * m, hipMemcpyDeviceToHost)); - hbuf[0] = 0; + hbuf[0] = descr->base; for (rocsparse_int i = 0; i < m; ++i) { hbuf[i+1] += hbuf[i]; @@ -290,7 +290,8 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, hyb->coo_row_ind, hyb->coo_col_ind, (T*)hyb->coo_val, - workspace2); + workspace2, + descr->base); From 77d8018f2a8ffe73dbf93a60c4707df734c172c4 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sat, 26 May 2018 22:59:56 +0200 Subject: [PATCH 088/304] hybmv: test preparations --- clients/include/testing_hybmv.hpp | 107 ++++++++++++++++++++++++++++++ clients/tests/CMakeLists.txt | 1 + clients/tests/test_hybmv.cpp | 77 +++++++++++++++++++++ 3 files changed, 185 insertions(+) create mode 100644 clients/include/testing_hybmv.hpp create mode 100644 clients/tests/test_hybmv.cpp diff --git a/clients/include/testing_hybmv.hpp b/clients/include/testing_hybmv.hpp new file mode 100644 index 00000000..c6bee286 --- /dev/null +++ b/clients/include/testing_hybmv.hpp @@ -0,0 +1,107 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef TESTING_HYBMV_HPP +#define TESTING_HYBMV_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_hybmv_bad_arg(void) +{ + rocsparse_int safe_size = 100; + T alpha = 0.6; + T beta = 0.2; + rocsparse_operation trans = rocsparse_operation_none; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_descr(new descr_struct); + rocsparse_mat_descr descr = unique_ptr_descr->descr; + + std::unique_ptr unique_ptr_hyb(new hyb_struct); + rocsparse_hyb_mat hyb = unique_ptr_hyb->hyb; + + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + T* dx = (T*)dx_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx || !dy) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // testing for(nullptr == dx) + { + T* dx_null = nullptr; + + status = rocsparse_hybmv(handle, trans, &alpha, descr, hyb, dx_null, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); + } + // testing for(nullptr == dy) + { + T* dy_null = nullptr; + + status = rocsparse_hybmv(handle, trans, &alpha, descr, hyb, dx, &beta, dy_null); + verify_rocsparse_status_invalid_pointer(status, "Error: dy is nullptr"); + } + // testing for(nullptr == d_alpha) + { + T* d_alpha_null = nullptr; + + status = rocsparse_hybmv(handle, trans, d_alpha_null, descr, hyb, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); + } + // testing for(nullptr == d_beta) + { + T* d_beta_null = nullptr; + + status = rocsparse_hybmv(handle, 
trans, &alpha, descr, hyb, dx, d_beta_null, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); + } + // testing for(nullptr == hyb) + { + rocsparse_hyb_mat hyb_null = nullptr; + + status = rocsparse_hybmv(handle, trans, &alpha, descr, hyb_null, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = rocsparse_hybmv(handle, trans, &alpha, descr_null, hyb, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_hybmv(handle_null, trans, &alpha, descr, hyb, dx, &beta, dy); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_hybmv(Arguments argus) +{ + return rocsparse_status_success; +} + +#endif // TESTING_HYBMV_HPP diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index de9d2bb4..74cb50b2 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -9,6 +9,7 @@ set(ROCSPARSE_TEST_SOURCES test_axpyi.cpp test_coomv.cpp test_csrmv.cpp + test_hybmv.cpp test_csr2coo.cpp test_csr2hyb.cpp test_coo2csr.cpp diff --git a/clients/tests/test_hybmv.cpp b/clients/tests/test_hybmv.cpp new file mode 100644 index 00000000..db4376bd --- /dev/null +++ b/clients/tests/test_hybmv.cpp @@ -0,0 +1,77 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "testing_hybmv.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef std::tuple hybmv_tuple; + +int hyb_M_range[] = {-1, 0, 10, 500, 7111, 10000}; +int hyb_N_range[] = {-3, 0, 33, 842, 4441, 10000}; + +std::vector hyb_alpha_range = {2.0, 3.0}; +std::vector hyb_beta_range = {0.0, 1.0}; + +rocsparse_index_base hyb_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; + +rocsparse_hyb_partition hyb_partition[] = {rocsparse_hyb_partition_auto, + rocsparse_hyb_partition_max, + rocsparse_hyb_partition_user}; + +int hyb_ELL_range[] = {-33, -1, 0, INT32_MAX}; + +class parameterized_hybmv : public testing::TestWithParam +{ + protected: + parameterized_hybmv() {} + virtual ~parameterized_hybmv() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_hybmv_arguments(hybmv_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.alpha = std::get<2>(tup); + arg.beta = std::get<3>(tup); + arg.idx_base = std::get<4>(tup); + arg.part = std::get<5>(tup); + arg.ell_width = std::get<6>(tup); + arg.timing = 0; + return arg; +} + +TEST(hybmv_bad_arg, hybmv_float) { testing_hybmv_bad_arg(); } + +TEST_P(parameterized_hybmv, hybmv_float) +{ + Arguments arg = setup_hybmv_arguments(GetParam()); + + rocsparse_status status = testing_hybmv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_hybmv, hybmv_double) +{ + Arguments arg = setup_hybmv_arguments(GetParam()); + + rocsparse_status status = testing_hybmv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(hybmv, + parameterized_hybmv, + testing::Combine(testing::ValuesIn(hyb_M_range), + testing::ValuesIn(hyb_N_range), + testing::ValuesIn(hyb_alpha_range), + testing::ValuesIn(hyb_beta_range), + testing::ValuesIn(hyb_idxbase_range), + testing::ValuesIn(hyb_partition), + 
testing::ValuesIn(hyb_ELL_range))); From 6ba4e0e410e01b5aa6134031a4173580b52a8cf7 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sat, 26 May 2018 23:14:24 +0200 Subject: [PATCH 089/304] clang-format --- .../rocsparse_template_specialization.cpp | 22 ++- clients/include/testing_csr2hyb.hpp | 165 ++++++++++++------ clients/include/testing_hybmv.hpp | 8 +- clients/tests/test_csr2hyb.cpp | 7 +- clients/tests/test_hybmv.cpp | 10 +- library/src/conversion/csr2hyb_device.h | 15 +- library/src/conversion/rocsparse_csr2hyb.cpp | 42 ++--- library/src/level2/rocsparse_hybmv.cpp | 67 ++++--- 8 files changed, 199 insertions(+), 137 deletions(-) diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index e4f41ee6..1331bdc9 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -146,7 +146,16 @@ rocsparse_status rocsparse_csr2hyb(rocsparse_handle handle, rocsparse_int user_ell_width, rocsparse_hyb_partition partition_type) { - return rocsparse_scsr2hyb(handle, m, n, descr, csr_val, csr_row_ptr, csr_col_ind, hyb, user_ell_width, partition_type); + return rocsparse_scsr2hyb(handle, + m, + n, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + hyb, + user_ell_width, + partition_type); } template <> @@ -161,7 +170,16 @@ rocsparse_status rocsparse_csr2hyb(rocsparse_handle handle, rocsparse_int user_ell_width, rocsparse_hyb_partition partition_type) { - return rocsparse_dcsr2hyb(handle, m, n, descr, csr_val, csr_row_ptr, csr_col_ind, hyb, user_ell_width, partition_type); + return rocsparse_dcsr2hyb(handle, + m, + n, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + hyb, + user_ell_width, + partition_type); } } // namespace rocsparse diff --git a/clients/include/testing_csr2hyb.hpp b/clients/include/testing_csr2hyb.hpp index 90fbfa19..e0d9f372 100644 --- a/clients/include/testing_csr2hyb.hpp +++ b/clients/include/testing_csr2hyb.hpp @@ -61,7 
+61,7 @@ void testing_csr2hyb_bad_arg(void) rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); - T* csr_val = (T*)csr_val_managed.get(); + T* csr_val = (T*)csr_val_managed.get(); if(!csr_row_ptr || !csr_col_ind || !csr_val) { @@ -73,28 +73,64 @@ void testing_csr2hyb_bad_arg(void) { rocsparse_int* csr_row_ptr_null = nullptr; - status = rocsparse_csr2hyb(handle, m, n, descr, csr_val, csr_row_ptr_null, csr_col_ind, hyb, 0, rocsparse_hyb_partition_auto); + status = rocsparse_csr2hyb(handle, + m, + n, + descr, + csr_val, + csr_row_ptr_null, + csr_col_ind, + hyb, + 0, + rocsparse_hyb_partition_auto); verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); } // Testing for(csr_col_ind == nullptr) { rocsparse_int* csr_col_ind_null = nullptr; - status = rocsparse_csr2hyb(handle, m, n, descr, csr_val, csr_row_ptr, csr_col_ind_null, hyb, 0, rocsparse_hyb_partition_auto); + status = rocsparse_csr2hyb(handle, + m, + n, + descr, + csr_val, + csr_row_ptr, + csr_col_ind_null, + hyb, + 0, + rocsparse_hyb_partition_auto); verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr"); } // Testing for(csr_val == nullptr) { T* csr_val_null = nullptr; - status = rocsparse_csr2hyb(handle, m, n, descr, csr_val_null, csr_row_ptr, csr_col_ind, hyb, 0, rocsparse_hyb_partition_auto); + status = rocsparse_csr2hyb(handle, + m, + n, + descr, + csr_val_null, + csr_row_ptr, + csr_col_ind, + hyb, + 0, + rocsparse_hyb_partition_auto); verify_rocsparse_status_invalid_pointer(status, "Error: csr_val is nullptr"); } // Testing for(handle == nullptr) { rocsparse_handle handle_null = nullptr; - status = rocsparse_csr2hyb(handle_null, m, n, descr, csr_val, csr_row_ptr, csr_col_ind, hyb, 0, rocsparse_hyb_partition_auto); + status = rocsparse_csr2hyb(handle_null, + m, + n, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + hyb, + 0, + rocsparse_hyb_partition_auto); 
verify_rocsparse_status_invalid_handle(status); } } @@ -136,11 +172,12 @@ rocsparse_status testing_csr2hyb(Arguments argus) rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto csr_col_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; - auto csr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto csr_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); - T* csr_val = (T*)csr_val_managed.get(); + T* csr_val = (T*)csr_val_managed.get(); if(!csr_row_ptr || !csr_col_ind || !csr_val) { @@ -149,7 +186,8 @@ rocsparse_status testing_csr2hyb(Arguments argus) return rocsparse_status_memory_error; } - status = rocsparse_csr2hyb(handle, m, n, descr, csr_val, csr_row_ptr, csr_col_ind, hyb, user_ell_width, part); + status = rocsparse_csr2hyb( + handle, m, n, descr, csr_val, csr_row_ptr, csr_col_ind, hyb, user_ell_width, part); if(m < 0 || n < 0) { @@ -197,7 +235,7 @@ rocsparse_status testing_csr2hyb(Arguments argus) rocsparse_int* dcsr_row_ptr = (rocsparse_int*)dcsr_row_ptr_managed.get(); rocsparse_int* dcsr_col_ind = (rocsparse_int*)dcsr_col_ind_managed.get(); - T* dcsr_val = (T*)dcsr_val_managed.get(); + T* dcsr_val = (T*)dcsr_val_managed.get(); if(!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val) { @@ -211,8 +249,7 @@ rocsparse_status testing_csr2hyb(Arguments argus) dcsr_row_ptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy( dcsr_col_ind, hcsr_col_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy( - dcsr_val, hcsr_val.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcsr_val, hcsr_val.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); // User given 
ELL width check if(part == rocsparse_hyb_partition_user) @@ -227,9 +264,19 @@ rocsparse_status testing_csr2hyb(Arguments argus) rocsparse_int max_allowed_ell_nnz_per_row = (2 * nnz - 1) / m + 1; if(user_ell_width < 0 || user_ell_width > max_allowed_ell_nnz_per_row) { - status = rocsparse_csr2hyb(handle, m, n, descr, dcsr_val, dcsr_row_ptr, dcsr_col_ind, hyb, user_ell_width, part); - - verify_rocsparse_status_invalid_value(status, "Error: user_ell_width < 0 || user_ell_width > max_ell_width"); + status = rocsparse_csr2hyb(handle, + m, + n, + descr, + dcsr_val, + dcsr_row_ptr, + dcsr_col_ind, + hyb, + user_ell_width, + part); + + verify_rocsparse_status_invalid_value( + status, "Error: user_ell_width < 0 || user_ell_width > max_ell_width"); return rocsparse_status_success; } @@ -244,8 +291,8 @@ rocsparse_status testing_csr2hyb(Arguments argus) // Host csr2hyb conversion rocsparse_int ell_width = 0; - rocsparse_int ell_nnz = 0; - rocsparse_int coo_nnz = 0; + rocsparse_int ell_nnz = 0; + rocsparse_int coo_nnz = 0; if(part == rocsparse_hyb_partition_auto || part == rocsparse_hyb_partition_user) { @@ -278,7 +325,7 @@ rocsparse_status testing_csr2hyb(Arguments argus) for(rocsparse_int i = 0; i < m; ++i) { rocsparse_int row_nnz = hcsr_row_ptr[i + 1] - hcsr_row_ptr[i]; - ell_width = (row_nnz > ell_width) ? row_nnz : ell_width; + ell_width = (row_nnz > ell_width) ? 
row_nnz : ell_width; } ell_nnz = ell_width * m; } @@ -301,30 +348,26 @@ rocsparse_status testing_csr2hyb(Arguments argus) { if(p < ell_width) { - rocsparse_int idx = ELL_IND(i, p++, m, ell_width); + rocsparse_int idx = ELL_IND(i, p++, m, ell_width); hhyb_ell_col_ind_gold[idx] = hcsr_col_ind[j] - idx_base; - hhyb_ell_val_gold[idx] = hcsr_val[j]; + hhyb_ell_val_gold[idx] = hcsr_val[j]; } else { hhyb_coo_row_ind_gold[coo_idx] = i; hhyb_coo_col_ind_gold[coo_idx] = hcsr_col_ind[j] - idx_base; - hhyb_coo_val_gold[coo_idx] = hcsr_val[j]; + hhyb_coo_val_gold[coo_idx] = hcsr_val[j]; ++coo_idx; } } for(rocsparse_int j = hcsr_row_ptr[i + 1] - hcsr_row_ptr[i]; j < ell_width; ++j) { - rocsparse_int idx = ELL_IND(i, p++, m, ell_width); + rocsparse_int idx = ELL_IND(i, p++, m, ell_width); hhyb_ell_col_ind_gold[idx] = -1; - hhyb_ell_val_gold[idx] = static_cast(0); + hhyb_ell_val_gold[idx] = static_cast(0); } } - - - - // Allocate verification structures std::vector hhyb_ell_col_ind(ell_nnz); std::vector hhyb_ell_val(ell_nnz); @@ -334,10 +377,11 @@ rocsparse_status testing_csr2hyb(Arguments argus) if(argus.unit_check) { - CHECK_ROCSPARSE_ERROR(rocsparse_csr2hyb(handle, m, n, descr, dcsr_val, dcsr_row_ptr, dcsr_col_ind, hyb, user_ell_width, part)); + CHECK_ROCSPARSE_ERROR(rocsparse_csr2hyb( + handle, m, n, descr, dcsr_val, dcsr_row_ptr, dcsr_col_ind, hyb, user_ell_width, part)); // Copy output from device to host - test_hyb *dhyb = (test_hyb*)hyb; + test_hyb* dhyb = (test_hyb*)hyb; // Check if sizes match unit_check_general(1, 1, &m, &dhyb->m); @@ -346,11 +390,22 @@ rocsparse_status testing_csr2hyb(Arguments argus) unit_check_general(1, 1, &ell_nnz, &dhyb->ell_nnz); unit_check_general(1, 1, &coo_nnz, &dhyb->coo_nnz); - CHECK_HIP_ERROR(hipMemcpy(hhyb_ell_col_ind.data(), dhyb->ell_col_ind, sizeof(rocsparse_int) * ell_nnz, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hhyb_ell_val.data(), dhyb->ell_val, sizeof(T) * ell_nnz, hipMemcpyDeviceToHost)); - 
CHECK_HIP_ERROR(hipMemcpy(hhyb_coo_row_ind.data(), dhyb->coo_row_ind, sizeof(rocsparse_int) * coo_nnz, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hhyb_coo_col_ind.data(), dhyb->coo_col_ind, sizeof(rocsparse_int) * coo_nnz, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hhyb_coo_val.data(), dhyb->coo_val, sizeof(T) * coo_nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hhyb_ell_col_ind.data(), + dhyb->ell_col_ind, + sizeof(rocsparse_int) * ell_nnz, + hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy( + hhyb_ell_val.data(), dhyb->ell_val, sizeof(T) * ell_nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hhyb_coo_row_ind.data(), + dhyb->coo_row_ind, + sizeof(rocsparse_int) * coo_nnz, + hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hhyb_coo_col_ind.data(), + dhyb->coo_col_ind, + sizeof(rocsparse_int) * coo_nnz, + hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy( + hhyb_coo_val.data(), dhyb->coo_val, sizeof(T) * coo_nnz, hipMemcpyDeviceToHost)); // Unit check unit_check_general(1, ell_nnz, hhyb_ell_col_ind_gold.data(), hhyb_ell_col_ind.data()); @@ -360,38 +415,32 @@ rocsparse_status testing_csr2hyb(Arguments argus) unit_check_general(1, coo_nnz, hhyb_coo_val_gold.data(), hhyb_coo_val.data()); } + /* + if(argus.timing) + { + rocsparse_int number_cold_calls = 2; + rocsparse_int number_hot_calls = argus.iters; + for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) + { + rocsparse_csr2hyb(handle, dcsr_row_ptr, nnz, m, dhyb_row_ind, idx_base); + } + double gpu_time_used = get_time_us(); + for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) + { + rocsparse_csr2hyb(handle, dcsr_row_ptr, nnz, m, dhyb_row_ind, idx_base); + } + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + double bandwidth = sizeof(rocsparse_int) * (nnz + m + 1) / gpu_time_used / 1e6; -/* - if(argus.timing) - { - rocsparse_int number_cold_calls = 2; - rocsparse_int number_hot_calls = argus.iters; - - for(rocsparse_int 
iter = 0; iter < number_cold_calls; ++iter) - { - rocsparse_csr2hyb(handle, dcsr_row_ptr, nnz, m, dhyb_row_ind, idx_base); - } - - double gpu_time_used = get_time_us(); - - for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) - { - rocsparse_csr2hyb(handle, dcsr_row_ptr, nnz, m, dhyb_row_ind, idx_base); + printf("m\t\tn\t\tnnz\t\tGB/s\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\n", m, n, nnz, bandwidth, gpu_time_used); } - - gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); - - double bandwidth = sizeof(rocsparse_int) * (nnz + m + 1) / gpu_time_used / 1e6; - - printf("m\t\tn\t\tnnz\t\tGB/s\tmsec\n"); - printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\n", m, n, nnz, bandwidth, gpu_time_used); - } -*/ + */ return rocsparse_status_success; } diff --git a/clients/include/testing_hybmv.hpp b/clients/include/testing_hybmv.hpp index c6bee286..8a6c4249 100644 --- a/clients/include/testing_hybmv.hpp +++ b/clients/include/testing_hybmv.hpp @@ -35,11 +35,11 @@ void testing_hybmv_bad_arg(void) std::unique_ptr unique_ptr_hyb(new hyb_struct); rocsparse_hyb_mat hyb = unique_ptr_hyb->hyb; - auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; - auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; - T* dx = (T*)dx_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx || !dy) { diff --git a/clients/tests/test_csr2hyb.cpp b/clients/tests/test_csr2hyb.cpp index 49d15867..2d4e653b 100644 --- a/clients/tests/test_csr2hyb.cpp +++ b/clients/tests/test_csr2hyb.cpp @@ -17,9 +17,8 @@ int csr2hyb_N_range[] = {-3, 0, 33, 242, 623, 1000}; rocsparse_index_base csr2hyb_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; 
-rocsparse_hyb_partition csr2hyb_partition[] = {rocsparse_hyb_partition_auto, - rocsparse_hyb_partition_max, - rocsparse_hyb_partition_user}; +rocsparse_hyb_partition csr2hyb_partition[] = { + rocsparse_hyb_partition_auto, rocsparse_hyb_partition_max, rocsparse_hyb_partition_user}; int csr2hyb_ELL_range[] = {-33, -1, 0, INT32_MAX}; @@ -40,7 +39,7 @@ Arguments setup_csr2hyb_arguments(csr2hyb_tuple tup) arg.idx_base = std::get<2>(tup); arg.part = std::get<3>(tup); arg.ell_width = std::get<4>(tup); - arg.timing = 0; + arg.timing = 0; return arg; } diff --git a/clients/tests/test_hybmv.cpp b/clients/tests/test_hybmv.cpp index db4376bd..a61619ec 100644 --- a/clients/tests/test_hybmv.cpp +++ b/clients/tests/test_hybmv.cpp @@ -9,7 +9,8 @@ #include #include -typedef std::tuple hybmv_tuple; +typedef std::tuple + hybmv_tuple; int hyb_M_range[] = {-1, 0, 10, 500, 7111, 10000}; int hyb_N_range[] = {-3, 0, 33, 842, 4441, 10000}; @@ -19,9 +20,8 @@ std::vector hyb_beta_range = {0.0, 1.0}; rocsparse_index_base hyb_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; -rocsparse_hyb_partition hyb_partition[] = {rocsparse_hyb_partition_auto, - rocsparse_hyb_partition_max, - rocsparse_hyb_partition_user}; +rocsparse_hyb_partition hyb_partition[] = { + rocsparse_hyb_partition_auto, rocsparse_hyb_partition_max, rocsparse_hyb_partition_user}; int hyb_ELL_range[] = {-33, -1, 0, INT32_MAX}; @@ -44,7 +44,7 @@ Arguments setup_hybmv_arguments(hybmv_tuple tup) arg.idx_base = std::get<4>(tup); arg.part = std::get<5>(tup); arg.ell_width = std::get<6>(tup); - arg.timing = 0; + arg.timing = 0; return arg; } diff --git a/library/src/conversion/csr2hyb_device.h b/library/src/conversion/csr2hyb_device.h index a7394c87..a423fcb0 100644 --- a/library/src/conversion/csr2hyb_device.h +++ b/library/src/conversion/csr2hyb_device.h @@ -27,8 +27,11 @@ __device__ void sum_reduce(rocsparse_int tid, rocsparse_int* data) } template -__global__ void -hyb_coo_nnz_part1(rocsparse_int m, 
rocsparse_int ell_width, const rocsparse_int* csr_row_ptr, rocsparse_int* workspace, rocsparse_int* coo_row_nnz) +__global__ void hyb_coo_nnz_part1(rocsparse_int m, + rocsparse_int ell_width, + const rocsparse_int* csr_row_ptr, + rocsparse_int* workspace, + rocsparse_int* coo_row_nnz) { rocsparse_int tid = hipThreadIdx_x; rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -39,7 +42,7 @@ hyb_coo_nnz_part1(rocsparse_int m, rocsparse_int ell_width, const rocsparse_int* { rocsparse_int row_nnz = csr_row_ptr[gid + 1] - csr_row_ptr[gid]; - if (row_nnz > ell_width) + if(row_nnz > ell_width) { row_nnz = row_nnz - ell_width; sdata[tid] = row_nnz; @@ -204,11 +207,11 @@ __global__ void csr2ell_kernel(rocsparse_int m, rocsparse_int row_begin = csr_row_ptr[ai] - idx_base; rocsparse_int row_end = csr_row_ptr[ai + 1] - idx_base; rocsparse_int coo_idx = coo_row_ind ? workspace[ai] - idx_base : 0; - + // Fill HYB matrix for(rocsparse_int aj = row_begin; aj < row_end; ++aj) { - if (p < ell_width) + if(p < ell_width) { rocsparse_int idx = ELL_IND(ai, p++, m, ell_width); ell_col_ind[idx] = csr_col_ind[aj] - idx_base; @@ -218,7 +221,7 @@ __global__ void csr2ell_kernel(rocsparse_int m, { coo_row_ind[coo_idx] = ai; coo_col_ind[coo_idx] = csr_col_ind[aj] - idx_base; - coo_val[coo_idx] = csr_val[aj]; + coo_val[coo_idx] = csr_val[aj]; ++coo_idx; } } diff --git a/library/src/conversion/rocsparse_csr2hyb.cpp b/library/src/conversion/rocsparse_csr2hyb.cpp index ccccefe6..098faf90 100644 --- a/library/src/conversion/rocsparse_csr2hyb.cpp +++ b/library/src/conversion/rocsparse_csr2hyb.cpp @@ -61,7 +61,9 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, return rocsparse_status_not_implemented; } // Check partition type - if(partition_type != rocsparse_hyb_partition_max && partition_type != rocsparse_hyb_partition_user && partition_type != rocsparse_hyb_partition_auto) + if(partition_type != rocsparse_hyb_partition_max && + partition_type != 
rocsparse_hyb_partition_user && + partition_type != rocsparse_hyb_partition_auto) { return rocsparse_status_invalid_value; } @@ -98,7 +100,8 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, // Get number of CSR non-zeros rocsparse_int csr_nnz; - RETURN_IF_HIP_ERROR(hipMemcpy(&csr_nnz, csr_row_ptr+m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR( + hipMemcpy(&csr_nnz, csr_row_ptr + m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); // Correct by index base csr_nnz -= descr->base; @@ -151,7 +154,7 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, RETURN_IF_HIP_ERROR(hipFree(hyb->coo_val)); } - // Determine ELL width +// Determine ELL width #define CSR2ELL_DIM 512 // Workspace size @@ -200,7 +203,8 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, // Allocate ELL part if(hyb->ell_nnz > 0) { - RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->ell_col_ind, sizeof(rocsparse_int) * hyb->ell_nnz)); + RETURN_IF_HIP_ERROR( + hipMalloc((void**)&hyb->ell_col_ind, sizeof(rocsparse_int) * hyb->ell_nnz)); RETURN_IF_HIP_ERROR(hipMalloc(&hyb->ell_val, sizeof(T) * hyb->ell_nnz)); } @@ -215,7 +219,8 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, if(hyb->ell_nnz == 0) { hyb->coo_nnz = csr_nnz; - RETURN_IF_HIP_ERROR(hipMemcpy(workspace2, csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpy( + workspace2, csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToDevice)); } else { @@ -242,36 +247,33 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, hipMemcpy(&hyb->coo_nnz, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); // Perform exclusive scan on workspace TODO use rocPRIM - std::vector hbuf(m+1); - RETURN_IF_HIP_ERROR(hipMemcpy(hbuf.data() + 1, workspace2, sizeof(rocsparse_int) * m, hipMemcpyDeviceToHost)); + std::vector hbuf(m + 1); + RETURN_IF_HIP_ERROR(hipMemcpy( + hbuf.data() + 1, 
workspace2, sizeof(rocsparse_int) * m, hipMemcpyDeviceToHost)); hbuf[0] = descr->base; - for (rocsparse_int i = 0; i < m; ++i) + for(rocsparse_int i = 0; i < m; ++i) { - hbuf[i+1] += hbuf[i]; + hbuf[i + 1] += hbuf[i]; } - RETURN_IF_HIP_ERROR(hipMemcpy(workspace2, hbuf.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpy( + workspace2, hbuf.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); } - } - RETURN_IF_HIP_ERROR(hipFree(workspace)); - - // Allocate COO part if(hyb->coo_nnz > 0) { - RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->coo_row_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); - RETURN_IF_HIP_ERROR(hipMalloc((void**)&hyb->coo_col_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); + RETURN_IF_HIP_ERROR( + hipMalloc((void**)&hyb->coo_row_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); + RETURN_IF_HIP_ERROR( + hipMalloc((void**)&hyb->coo_col_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); RETURN_IF_HIP_ERROR(hipMalloc(&hyb->coo_val, sizeof(T) * hyb->coo_nnz)); } - - - dim3 csr2ell_blocks((m - 1) / CSR2ELL_DIM + 1); dim3 csr2ell_threads(CSR2ELL_DIM); @@ -293,8 +295,6 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, workspace2, descr->base); - - RETURN_IF_HIP_ERROR(hipFree(workspace2)); #undef CSR2ELL_DIM diff --git a/library/src/level2/rocsparse_hybmv.cpp b/library/src/level2/rocsparse_hybmv.cpp index d5ce0f36..e1ceaf7e 100644 --- a/library/src/level2/rocsparse_hybmv.cpp +++ b/library/src/level2/rocsparse_hybmv.cpp @@ -128,11 +128,11 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, // TODO return rocsparse_status_not_implemented; } -// TODO check partition type -// if(hyb->partition != rocsparse_hyb_partition_max) -// { -// return rocsparse_status_not_implemented; -// } + // TODO check partition type + // if(hyb->partition != rocsparse_hyb_partition_max) + // { + // return rocsparse_status_not_implemented; + // } // Check sizes if(hyb->m < 0) @@ -251,23 +251,24 @@ 
rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, { // TODO #define COOMVN_DIM 128 - rocsparse_int maxthreads = handle->properties.maxThreadsPerBlock; - rocsparse_int nprocs = handle->properties.multiProcessorCount; - rocsparse_int maxblocks = (nprocs * maxthreads - 1) / COOMVN_DIM + 1; - rocsparse_int minblocks = (hyb->coo_nnz - 1) / COOMVN_DIM + 1; + rocsparse_int maxthreads = handle->properties.maxThreadsPerBlock; + rocsparse_int nprocs = handle->properties.multiProcessorCount; + rocsparse_int maxblocks = (nprocs * maxthreads - 1) / COOMVN_DIM + 1; + rocsparse_int minblocks = (hyb->coo_nnz - 1) / COOMVN_DIM + 1; - rocsparse_int nblocks = maxblocks < minblocks ? maxblocks : minblocks; - rocsparse_int nwarps = nblocks * (COOMVN_DIM / handle->warp_size); - rocsparse_int nloops = (hyb->coo_nnz / handle->warp_size + 1) / nwarps + 1; + rocsparse_int nblocks = maxblocks < minblocks ? maxblocks : minblocks; + rocsparse_int nwarps = nblocks * (COOMVN_DIM / handle->warp_size); + rocsparse_int nloops = (hyb->coo_nnz / handle->warp_size + 1) / nwarps + 1; - dim3 coomvn_blocks(nblocks); - dim3 coomvn_threads(COOMVN_DIM); + dim3 coomvn_blocks(nblocks); + dim3 coomvn_threads(COOMVN_DIM); - rocsparse_int* row_block_red = NULL; - T* val_block_red = NULL; + rocsparse_int* row_block_red = NULL; + T* val_block_red = NULL; - RETURN_IF_HIP_ERROR(hipMalloc((void**)&row_block_red, sizeof(rocsparse_int) * nwarps)); - RETURN_IF_HIP_ERROR(hipMalloc((void**)&val_block_red, sizeof(T) * nwarps)); + RETURN_IF_HIP_ERROR( + hipMalloc((void**)&row_block_red, sizeof(rocsparse_int) * nwarps)); + RETURN_IF_HIP_ERROR(hipMalloc((void**)&val_block_red, sizeof(T) * nwarps)); hipLaunchKernelGGL((coomvn_warp_host_pointer), coomvn_blocks, @@ -286,27 +287,19 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, val_block_red, descr->base); - hipLaunchKernelGGL((coomvn_general_block_reduce), - dim3(1), - coomvn_threads, - 0, - stream, - nwarps, - row_block_red, - 
val_block_red, - y); + hipLaunchKernelGGL((coomvn_general_block_reduce), + dim3(1), + coomvn_threads, + 0, + stream, + nwarps, + row_block_red, + val_block_red, + y); - RETURN_IF_HIP_ERROR(hipFree(row_block_red)); - RETURN_IF_HIP_ERROR(hipFree(val_block_red)); + RETURN_IF_HIP_ERROR(hipFree(row_block_red)); + RETURN_IF_HIP_ERROR(hipFree(val_block_red)); #undef COOMVN_DIM - - - - - - - - } } #undef ELLMVN_DIM From 69b08c7c272d39468b6b60a7ddcc7bc410323344 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 09:13:21 +0200 Subject: [PATCH 090/304] hyb matrix: index_base_one support --- clients/include/testing_csr2hyb.hpp | 6 +++--- library/src/conversion/csr2hyb_device.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/clients/include/testing_csr2hyb.hpp b/clients/include/testing_csr2hyb.hpp index e0d9f372..54b4efaf 100644 --- a/clients/include/testing_csr2hyb.hpp +++ b/clients/include/testing_csr2hyb.hpp @@ -349,13 +349,13 @@ rocsparse_status testing_csr2hyb(Arguments argus) if(p < ell_width) { rocsparse_int idx = ELL_IND(i, p++, m, ell_width); - hhyb_ell_col_ind_gold[idx] = hcsr_col_ind[j] - idx_base; + hhyb_ell_col_ind_gold[idx] = hcsr_col_ind[j]; hhyb_ell_val_gold[idx] = hcsr_val[j]; } else { - hhyb_coo_row_ind_gold[coo_idx] = i; - hhyb_coo_col_ind_gold[coo_idx] = hcsr_col_ind[j] - idx_base; + hhyb_coo_row_ind_gold[coo_idx] = i + idx_base; + hhyb_coo_col_ind_gold[coo_idx] = hcsr_col_ind[j]; hhyb_coo_val_gold[coo_idx] = hcsr_val[j]; ++coo_idx; } diff --git a/library/src/conversion/csr2hyb_device.h b/library/src/conversion/csr2hyb_device.h index a423fcb0..e0a05d10 100644 --- a/library/src/conversion/csr2hyb_device.h +++ b/library/src/conversion/csr2hyb_device.h @@ -214,13 +214,13 @@ __global__ void csr2ell_kernel(rocsparse_int m, if(p < ell_width) { rocsparse_int idx = ELL_IND(ai, p++, m, ell_width); - ell_col_ind[idx] = csr_col_ind[aj] - idx_base; + ell_col_ind[idx] = csr_col_ind[aj]; ell_val[idx] = csr_val[aj]; } else { - 
coo_row_ind[coo_idx] = ai; - coo_col_ind[coo_idx] = csr_col_ind[aj] - idx_base; + coo_row_ind[coo_idx] = ai + idx_base; + coo_col_ind[coo_idx] = csr_col_ind[aj]; coo_val[coo_idx] = csr_val[aj]; ++coo_idx; } From b6f317b7ba015b01a57ac93e87c4398b6a643e47 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 09:15:07 +0200 Subject: [PATCH 091/304] moved level2 routines templates into header files --- library/src/include/definitions.h | 9 + library/src/level2/rocsparse_coomv.cpp | 402 +--------------- library/src/level2/rocsparse_coomv.hpp | 412 ++++++++++++++++ library/src/level2/rocsparse_csrmv.cpp | 626 +----------------------- library/src/level2/rocsparse_csrmv.hpp | 636 +++++++++++++++++++++++++ library/src/level2/rocsparse_hybmv.cpp | 309 +----------- library/src/level2/rocsparse_hybmv.hpp | 324 +++++++++++++ 7 files changed, 1384 insertions(+), 1334 deletions(-) create mode 100644 library/src/level2/rocsparse_coomv.hpp create mode 100644 library/src/level2/rocsparse_csrmv.hpp create mode 100644 library/src/level2/rocsparse_hybmv.hpp diff --git a/library/src/include/definitions.h b/library/src/include/definitions.h index ea888ffa..6776ad20 100644 --- a/library/src/include/definitions.h +++ b/library/src/include/definitions.h @@ -23,6 +23,15 @@ } \ } +#define RETURN_IF_ROCSPARSE_ERROR(INPUT_STATUS_FOR_CHECK) \ + { \ + rocsparse_status TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ + if(TMP_STATUS_FOR_CHECK != rocsparse_status_success) \ + { \ + return TMP_STATUS_FOR_CHECK; \ + } \ + } + #define THROW_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK) \ { \ hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \ diff --git a/library/src/level2/rocsparse_coomv.cpp b/library/src/level2/rocsparse_coomv.cpp index ac10e2ad..041a9368 100644 --- a/library/src/level2/rocsparse_coomv.cpp +++ b/library/src/level2/rocsparse_coomv.cpp @@ -3,407 +3,7 @@ * ************************************************************************ */ #include "rocsparse.h" -#include 
"definitions.h" -#include "handle.h" -#include "utility.h" -#include "coomv_device.h" - -#include - -template -__global__ void coomvn_warp_host_pointer(rocsparse_int nnz, - rocsparse_int loops, - T alpha, - const rocsparse_int* coo_row_ind, - const rocsparse_int* coo_col_ind, - const T* coo_val, - const T* x, - T* y, - rocsparse_int* row_block_red, - T* val_block_red, - rocsparse_index_base idx_base) -{ - coomvn_general_warp_reduce(nnz, - loops, - alpha, - coo_row_ind, - coo_col_ind, - coo_val, - x, - y, - row_block_red, - val_block_red, - idx_base); -} - -template -__global__ void coomvn_warp_device_pointer(rocsparse_int nnz, - rocsparse_int loops, - const T* alpha, - const rocsparse_int* coo_row_ind, - const rocsparse_int* coo_col_ind, - const T* coo_val, - const T* x, - T* y, - rocsparse_int* row_block_red, - T* val_block_red, - rocsparse_index_base idx_base) -{ - coomvn_general_warp_reduce(nnz, - loops, - *alpha, - coo_row_ind, - coo_col_ind, - coo_val, - x, - y, - row_block_red, - val_block_red, - idx_base); -} - -/*! \brief SPARSE Level 2 API - - \details - coomv multiplies the dense vector x[i] with scalar alpha and sparse m x n - matrix A that is defined in COO storage format and add the result to y[i] - that is multiplied by beta, for i = 1 , … , n - - y := alpha * op(A) * x + beta * y, - - @param[in] - handle rocsparse_handle. - handle to the rocsparse library context queue. - @param[in] - trans operation type of A. - @param[in] - m number of rows of A. - @param[in] - n number of columns of A. - @param[in] - nnz number of non-zero entries of A. - @param[in] - alpha scalar alpha. - @param[in] - descr descriptor of A. - @param[in] - coo_val array of nnz elements of A. - @param[in] - coo_row_ind array of nnz elements containing the row indices of A. - @param[in] - coo_col_ind array of nnz elements containing the column indices of A. - @param[in] - x array of n elements (op(A) = A) or m elements (op(A) = A^T or - op(A) = A^H). - @param[in] - beta scalar beta. 
- @param[inout] - y array of m elements (op(A) = A) or n elements (op(A) = A^T or - op(A) = A^H). - - ********************************************************************/ -template -rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int n, - rocsparse_int nnz, - const T* alpha, - const rocsparse_mat_descr descr, - const T* coo_val, - const rocsparse_int* coo_row_ind, - const rocsparse_int* coo_col_ind, - const T* x, - const T* beta, - T* y) -{ - // Check for valid handle and matrix descriptor - if(handle == nullptr) - { - return rocsparse_status_invalid_handle; - } - else if(descr == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Logging TODO bench logging - if(handle->pointer_mode == rocsparse_pointer_mode_host) - { - log_trace(handle, - replaceX("rocsparse_Xcoomv"), - trans, - m, - n, - nnz, - *alpha, - (const void*&)descr, - (const void*&)coo_val, - (const void*&)coo_row_ind, - (const void*&)coo_col_ind, - (const void*&)x, - *beta, - (const void*&)y); - } - else - { - log_trace(handle, - replaceX("rocsparse_Xcoomv"), - trans, - m, - n, - nnz, - (const void*&)alpha, - (const void*&)descr, - (const void*&)coo_val, - (const void*&)coo_row_ind, - (const void*&)coo_col_ind, - (const void*&)x, - (const void*&)beta, - (const void*&)y); - } - - // Check index base - if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) - { - return rocsparse_status_invalid_value; - } - // Check matrix type - if(descr->type != rocsparse_matrix_type_general) - { - // TODO - return rocsparse_status_not_implemented; - } - - // Check sizes - if(m < 0) - { - return rocsparse_status_invalid_size; - } - else if(n < 0) - { - return rocsparse_status_invalid_size; - } - else if(nnz < 0) - { - return rocsparse_status_invalid_size; - } - - // Check pointer arguments - if(coo_val == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(coo_row_ind == 
nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(coo_col_ind == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(x == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(y == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(alpha == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(beta == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Quick return if possible - if(m == 0 || n == 0 || nnz == 0) - { - return rocsparse_status_success; - } - - // Stream - hipStream_t stream = handle->stream; - - // Run different coomv kernels - if(trans == rocsparse_operation_none) - { -#define COOMVN_DIM 128 - rocsparse_int maxthreads = handle->properties.maxThreadsPerBlock; - rocsparse_int nprocs = handle->properties.multiProcessorCount; - rocsparse_int maxblocks = (nprocs * maxthreads - 1) / COOMVN_DIM + 1; - rocsparse_int minblocks = (nnz - 1) / COOMVN_DIM + 1; - - rocsparse_int nblocks = maxblocks < minblocks ? 
maxblocks : minblocks; - rocsparse_int nwarps = nblocks * (COOMVN_DIM / handle->warp_size); - rocsparse_int nloops = (nnz / handle->warp_size + 1) / nwarps + 1; - - dim3 coomvn_blocks(nblocks); - dim3 coomvn_threads(COOMVN_DIM); - - rocsparse_int* row_block_red = NULL; - T* val_block_red = NULL; - - RETURN_IF_HIP_ERROR(hipMalloc((void**)&row_block_red, sizeof(rocsparse_int) * nwarps)); - RETURN_IF_HIP_ERROR(hipMalloc((void**)&val_block_red, sizeof(T) * nwarps)); - - if(handle->pointer_mode == rocsparse_pointer_mode_device) - { - // We need a host copy of beta to avoid unneccessary kernel launch - T h_beta; - RETURN_IF_HIP_ERROR(hipMemcpy(&h_beta, beta, sizeof(T), hipMemcpyDeviceToHost)); - - if(h_beta == static_cast(0)) - { - RETURN_IF_HIP_ERROR(hipMemset(y, 0, sizeof(T) * m)); - } - else if(h_beta != static_cast(1)) - { - hipLaunchKernelGGL((coomv_scale), - dim3((m - 1) / COOMVN_DIM + 1), - coomvn_threads, - 0, - stream, - m, - h_beta, - y); - } - - if(handle->warp_size == 32) - { - hipLaunchKernelGGL((coomvn_warp_device_pointer), - coomvn_blocks, - coomvn_threads, - 0, - stream, - nnz, - nloops, - alpha, - coo_row_ind, - coo_col_ind, - coo_val, - x, - y, - row_block_red, - val_block_red, - descr->base); - } - else if(handle->warp_size == 64) - { - hipLaunchKernelGGL((coomvn_warp_device_pointer), - coomvn_blocks, - coomvn_threads, - 0, - stream, - nnz, - nloops, - alpha, - coo_row_ind, - coo_col_ind, - coo_val, - x, - y, - row_block_red, - val_block_red, - descr->base); - } - else - { - return rocsparse_status_arch_mismatch; - } - } - else - { - if(*alpha == static_cast(0) && *beta == static_cast(1)) - { - return rocsparse_status_success; - } - - // If beta == 0.0 we need to set y to 0 - if(*beta == static_cast(0)) - { - RETURN_IF_HIP_ERROR(hipMemset(y, 0, sizeof(T) * m)); - } - else if(*beta != static_cast(1)) - { - hipLaunchKernelGGL((coomv_scale), - dim3((m - 1) / COOMVN_DIM + 1), - coomvn_threads, - 0, - stream, - m, - *beta, - y); - } - - if(handle->warp_size 
== 32) - { - hipLaunchKernelGGL((coomvn_warp_host_pointer), - coomvn_blocks, - coomvn_threads, - 0, - stream, - nnz, - nloops, - *alpha, - coo_row_ind, - coo_col_ind, - coo_val, - x, - y, - row_block_red, - val_block_red, - descr->base); - } - else if(handle->warp_size == 64) - { - hipLaunchKernelGGL((coomvn_warp_host_pointer), - coomvn_blocks, - coomvn_threads, - 0, - stream, - nnz, - nloops, - *alpha, - coo_row_ind, - coo_col_ind, - coo_val, - x, - y, - row_block_red, - val_block_red, - descr->base); - } - else - { - return rocsparse_status_arch_mismatch; - } - } - - hipLaunchKernelGGL((coomvn_general_block_reduce), - dim3(1), - coomvn_threads, - 0, - stream, - nwarps, - row_block_red, - val_block_red, - y); - - RETURN_IF_HIP_ERROR(hipFree(row_block_red)); - RETURN_IF_HIP_ERROR(hipFree(val_block_red)); -#undef COOMVN_DIM - } - else - { - // TODO - return rocsparse_status_not_implemented; - } - return rocsparse_status_success; -} +#include "rocsparse_coomv.hpp" /* * =========================================================================== diff --git a/library/src/level2/rocsparse_coomv.hpp b/library/src/level2/rocsparse_coomv.hpp new file mode 100644 index 00000000..dbc43f2d --- /dev/null +++ b/library/src/level2/rocsparse_coomv.hpp @@ -0,0 +1,412 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_COOMV_HPP +#define ROCSPARSE_COOMV_HPP + +#include "rocsparse.h" +#include "definitions.h" +#include "handle.h" +#include "utility.h" +#include "coomv_device.h" + +#include + +template +__global__ void coomvn_warp_host_pointer(rocsparse_int nnz, + rocsparse_int loops, + T alpha, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const T* coo_val, + const T* x, + T* y, + rocsparse_int* row_block_red, + T* val_block_red, + rocsparse_index_base idx_base) +{ + coomvn_general_warp_reduce(nnz, + loops, + alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + idx_base); +} + +template +__global__ void coomvn_warp_device_pointer(rocsparse_int nnz, + rocsparse_int loops, + const T* alpha, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const T* coo_val, + const T* x, + T* y, + rocsparse_int* row_block_red, + T* val_block_red, + rocsparse_index_base idx_base) +{ + coomvn_general_warp_reduce(nnz, + loops, + *alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + idx_base); +} + +/*! \brief SPARSE Level 2 API + + \details + coomv multiplies the dense vector x[i] with scalar alpha and sparse m x n + matrix A that is defined in COO storage format and add the result to y[i] + that is multiplied by beta, for i = 1 , … , n + + y := alpha * op(A) * x + beta * y, + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + trans operation type of A. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero entries of A. + @param[in] + alpha scalar alpha. + @param[in] + descr descriptor of A. + @param[in] + coo_val array of nnz elements of A. + @param[in] + coo_row_ind array of nnz elements containing the row indices of A. 
+ @param[in] + coo_col_ind array of nnz elements containing the column indices of A. + @param[in] + x array of n elements (op(A) = A) or m elements (op(A) = A^T or + op(A) = A^H). + @param[in] + beta scalar beta. + @param[inout] + y array of m elements (op(A) = A) or n elements (op(A) = A^T or + op(A) = A^H). + + ********************************************************************/ +template +rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const T* alpha, + const rocsparse_mat_descr descr, + const T* coo_val, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const T* x, + const T* beta, + T* y) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + if(handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_Xcoomv"), + trans, + m, + n, + nnz, + *alpha, + (const void*&)descr, + (const void*&)coo_val, + (const void*&)coo_row_ind, + (const void*&)coo_col_ind, + (const void*&)x, + *beta, + (const void*&)y); + } + else + { + log_trace(handle, + replaceX("rocsparse_Xcoomv"), + trans, + m, + n, + nnz, + (const void*&)alpha, + (const void*&)descr, + (const void*&)coo_val, + (const void*&)coo_row_ind, + (const void*&)coo_col_ind, + (const void*&)x, + (const void*&)beta, + (const void*&)y); + } + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + // Check matrix type + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(n < 0) + { + return 
rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(coo_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(coo_row_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(coo_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(alpha == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(beta == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + // Run different coomv kernels + if(trans == rocsparse_operation_none) + { +#define COOMVN_DIM 128 + rocsparse_int maxthreads = handle->properties.maxThreadsPerBlock; + rocsparse_int nprocs = handle->properties.multiProcessorCount; + rocsparse_int maxblocks = (nprocs * maxthreads - 1) / COOMVN_DIM + 1; + rocsparse_int minblocks = (nnz - 1) / COOMVN_DIM + 1; + + rocsparse_int nblocks = maxblocks < minblocks ? 
maxblocks : minblocks; + rocsparse_int nwarps = nblocks * (COOMVN_DIM / handle->warp_size); + rocsparse_int nloops = (nnz / handle->warp_size + 1) / nwarps + 1; + + dim3 coomvn_blocks(nblocks); + dim3 coomvn_threads(COOMVN_DIM); + + rocsparse_int* row_block_red = NULL; + T* val_block_red = NULL; + + RETURN_IF_HIP_ERROR(hipMalloc((void**)&row_block_red, sizeof(rocsparse_int) * nwarps)); + RETURN_IF_HIP_ERROR(hipMalloc((void**)&val_block_red, sizeof(T) * nwarps)); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + // We need a host copy of beta to avoid unneccessary kernel launch + T h_beta; + RETURN_IF_HIP_ERROR(hipMemcpy(&h_beta, beta, sizeof(T), hipMemcpyDeviceToHost)); + + if(h_beta == static_cast(0)) + { + RETURN_IF_HIP_ERROR(hipMemset(y, 0, sizeof(T) * m)); + } + else if(h_beta != static_cast(1)) + { + hipLaunchKernelGGL((coomv_scale), + dim3((m - 1) / COOMVN_DIM + 1), + coomvn_threads, + 0, + stream, + m, + h_beta, + y); + } + + if(handle->warp_size == 32) + { + hipLaunchKernelGGL((coomvn_warp_device_pointer), + coomvn_blocks, + coomvn_threads, + 0, + stream, + nnz, + nloops, + alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + descr->base); + } + else if(handle->warp_size == 64) + { + hipLaunchKernelGGL((coomvn_warp_device_pointer), + coomvn_blocks, + coomvn_threads, + 0, + stream, + nnz, + nloops, + alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + descr->base); + } + else + { + return rocsparse_status_arch_mismatch; + } + } + else + { + if(*alpha == static_cast(0) && *beta == static_cast(1)) + { + return rocsparse_status_success; + } + + // If beta == 0.0 we need to set y to 0 + if(*beta == static_cast(0)) + { + RETURN_IF_HIP_ERROR(hipMemset(y, 0, sizeof(T) * m)); + } + else if(*beta != static_cast(1)) + { + hipLaunchKernelGGL((coomv_scale), + dim3((m - 1) / COOMVN_DIM + 1), + coomvn_threads, + 0, + stream, + m, + *beta, + y); + } + + if(handle->warp_size 
== 32) + { + hipLaunchKernelGGL((coomvn_warp_host_pointer), + coomvn_blocks, + coomvn_threads, + 0, + stream, + nnz, + nloops, + *alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + descr->base); + } + else if(handle->warp_size == 64) + { + hipLaunchKernelGGL((coomvn_warp_host_pointer), + coomvn_blocks, + coomvn_threads, + 0, + stream, + nnz, + nloops, + *alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + descr->base); + } + else + { + return rocsparse_status_arch_mismatch; + } + } + + hipLaunchKernelGGL((coomvn_general_block_reduce), + dim3(1), + coomvn_threads, + 0, + stream, + nwarps, + row_block_red, + val_block_red, + y); + + RETURN_IF_HIP_ERROR(hipFree(row_block_red)); + RETURN_IF_HIP_ERROR(hipFree(val_block_red)); +#undef COOMVN_DIM + } + else + { + // TODO + return rocsparse_status_not_implemented; + } + return rocsparse_status_success; +} + +#endif // ROCSPARSE_COOMV_HPP diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index 411a36cd..f796eb5a 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -3,631 +3,7 @@ * ************************************************************************ */ #include "rocsparse.h" -#include "handle.h" -#include "utility.h" -#include "csrmv_device.h" - -#include - -template -__global__ void csrmvn_kernel_host_pointer(rocsparse_int m, - T alpha, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const T* csr_val, - const T* x, - T beta, - T* y, - rocsparse_index_base idx_base) -{ - csrmvn_general_device( - m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y, idx_base); -} - -template -__global__ void csrmvn_kernel_device_pointer(rocsparse_int m, - const T* alpha, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const T* csr_val, - const T* x, - const T* beta, - T* y, - rocsparse_index_base idx_base) -{ - 
csrmvn_general_device( - m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y, idx_base); -} - -/*! \brief SPARSE Level 2 API - - \details - csrmv multiplies the dense vector x[i] with scalar alpha and sparse m x n - matrix A that is defined in CSR storage format and add the result to y[i] - that is multiplied by beta, for i = 1 , … , n - - y := alpha * op(A) * x + beta * y, - - @param[in] - handle rocsparse_handle. - handle to the rocsparse library context queue. - @param[in] - trans operation type of A. - @param[in] - m number of rows of A. - @param[in] - n number of columns of A. - @param[in] - nnz number of non-zero entries of A. - @param[in] - alpha scalar alpha. - @param[in] - descr descriptor of A. - @param[in] - csr_val array of nnz elements of A. - @param[in] - csr_row_ptr array of m+1 elements that point to the start - of every row of A. - @param[in] - csr_col_ind array of nnz elements containing the column indices of A. - @param[in] - x array of n elements (op(A) = A) or m elements (op(A) = A^T or - op(A) = A^H). - @param[in] - beta scalar beta. - @param[inout] - y array of m elements (op(A) = A) or n elements (op(A) = A^T or - op(A) = A^H). 
- - ********************************************************************/ -template -rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int n, - rocsparse_int nnz, - const T* alpha, - const rocsparse_mat_descr descr, - const T* csr_val, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const T* x, - const T* beta, - T* y) -{ - // Check for valid handle and matrix descriptor - if(handle == nullptr) - { - return rocsparse_status_invalid_handle; - } - else if(descr == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Logging TODO bench logging - if(handle->pointer_mode == rocsparse_pointer_mode_host) - { - log_trace(handle, - replaceX("rocsparse_Xcsrmv"), - trans, - m, - n, - nnz, - *alpha, - (const void*&)descr, - (const void*&)csr_val, - (const void*&)csr_row_ptr, - (const void*&)csr_col_ind, - (const void*&)x, - *beta, - (const void*&)y); - } - else - { - log_trace(handle, - replaceX("rocsparse_Xcsrmv"), - trans, - m, - n, - nnz, - (const void*&)alpha, - (const void*&)descr, - (const void*&)csr_val, - (const void*&)csr_row_ptr, - (const void*&)csr_col_ind, - (const void*&)x, - (const void*&)beta, - (const void*&)y); - } - - // Check index base - if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) - { - return rocsparse_status_invalid_value; - } - if(descr->type != rocsparse_matrix_type_general) - { - // TODO - return rocsparse_status_not_implemented; - } - - // Check sizes - if(m < 0) - { - return rocsparse_status_invalid_size; - } - else if(n < 0) - { - return rocsparse_status_invalid_size; - } - else if(nnz < 0) - { - return rocsparse_status_invalid_size; - } - - // Check pointer arguments - if(csr_val == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(csr_row_ptr == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(csr_col_ind == nullptr) - { - return 
rocsparse_status_invalid_pointer; - } - else if(x == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(y == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(alpha == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(beta == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Quick return if possible - if(m == 0 || n == 0 || nnz == 0) - { - return rocsparse_status_success; - } - - // Stream - hipStream_t stream = handle->stream; - - // Run different csrmv kernels - if(trans == rocsparse_operation_none) - { -#define CSRMVN_DIM 512 - rocsparse_int nnz_per_row = nnz / m; - - dim3 csrmvn_blocks((m - 1) / CSRMVN_DIM + 1); - dim3 csrmvn_threads(CSRMVN_DIM); - - if(handle->pointer_mode == rocsparse_pointer_mode_device) - { - if(handle->warp_size == 32) - { - if(nnz_per_row < 4) - { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - beta, - y, - descr->base); - } - else if(nnz_per_row < 8) - { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - beta, - y, - descr->base); - } - else if(nnz_per_row < 16) - { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - beta, - y, - descr->base); - } - else if(nnz_per_row < 32) - { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - beta, - y, - descr->base); - } - else - { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - beta, - y, - descr->base); - } - } - else if(handle->warp_size == 64) - { - 
if(nnz_per_row < 4) - { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - beta, - y, - descr->base); - } - else if(nnz_per_row < 8) - { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - beta, - y, - descr->base); - } - else if(nnz_per_row < 16) - { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - beta, - y, - descr->base); - } - else if(nnz_per_row < 32) - { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - beta, - y, - descr->base); - } - else if(nnz_per_row < 64) - { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - beta, - y, - descr->base); - } - else - { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - beta, - y, - descr->base); - } - } - else - { - return rocsparse_status_arch_mismatch; - } - } - else - { - if(*alpha == 0.0 && *beta == 1.0) - { - return rocsparse_status_success; - } - - if(handle->warp_size == 32) - { - if(nnz_per_row < 4) - { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - *beta, - y, - descr->base); - } - else if(nnz_per_row < 8) - { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - *beta, - y, - descr->base); - } - else if(nnz_per_row 
< 16) - { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - *beta, - y, - descr->base); - } - else if(nnz_per_row < 32) - { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - *beta, - y, - descr->base); - } - else - { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - *beta, - y, - descr->base); - } - } - else if(handle->warp_size == 64) - { - if(nnz_per_row < 4) - { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - *beta, - y, - descr->base); - } - else if(nnz_per_row < 8) - { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - *beta, - y, - descr->base); - } - else if(nnz_per_row < 16) - { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - *beta, - y, - descr->base); - } - else if(nnz_per_row < 32) - { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - *beta, - y, - descr->base); - } - else if(nnz_per_row < 64) - { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - *beta, - y, - descr->base); - } - else - { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), - csrmvn_blocks, - csrmvn_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - *beta, 
- y, - descr->base); - } - } - else - { - return rocsparse_status_arch_mismatch; - } - } -#undef CSRMVN_DIM - } - else - { - // TODO - return rocsparse_status_not_implemented; - } - return rocsparse_status_success; -} +#include "rocsparse_csrmv.hpp" /* * =========================================================================== diff --git a/library/src/level2/rocsparse_csrmv.hpp b/library/src/level2/rocsparse_csrmv.hpp new file mode 100644 index 00000000..8853a7fe --- /dev/null +++ b/library/src/level2/rocsparse_csrmv.hpp @@ -0,0 +1,636 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_CSRMV_HPP +#define ROCSPARSE_CSRMV_HPP + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "csrmv_device.h" + +#include + +template +__global__ void csrmvn_kernel_host_pointer(rocsparse_int m, + T alpha, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* csr_val, + const T* x, + T beta, + T* y, + rocsparse_index_base idx_base) +{ + csrmvn_general_device( + m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y, idx_base); +} + +template +__global__ void csrmvn_kernel_device_pointer(rocsparse_int m, + const T* alpha, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* csr_val, + const T* x, + const T* beta, + T* y, + rocsparse_index_base idx_base) +{ + csrmvn_general_device( + m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y, idx_base); +} + +/*! \brief SPARSE Level 2 API + + \details + csrmv multiplies the dense vector x[i] with scalar alpha and sparse m x n + matrix A that is defined in CSR storage format and add the result to y[i] + that is multiplied by beta, for i = 1 , … , n + + y := alpha * op(A) * x + beta * y, + + @param[in] + handle rocsparse_handle. 
+ handle to the rocsparse library context queue. + @param[in] + trans operation type of A. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero entries of A. + @param[in] + alpha scalar alpha. + @param[in] + descr descriptor of A. + @param[in] + csr_val array of nnz elements of A. + @param[in] + csr_row_ptr array of m+1 elements that point to the start + of every row of A. + @param[in] + csr_col_ind array of nnz elements containing the column indices of A. + @param[in] + x array of n elements (op(A) = A) or m elements (op(A) = A^T or + op(A) = A^H). + @param[in] + beta scalar beta. + @param[inout] + y array of m elements (op(A) = A) or n elements (op(A) = A^T or + op(A) = A^H). + + ********************************************************************/ +template +rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const T* alpha, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* x, + const T* beta, + T* y) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + if(handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_Xcsrmv"), + trans, + m, + n, + nnz, + *alpha, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)x, + *beta, + (const void*&)y); + } + else + { + log_trace(handle, + replaceX("rocsparse_Xcsrmv"), + trans, + m, + n, + nnz, + (const void*&)alpha, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)x, + (const void*&)beta, + (const 
void*&)y); + } + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(n < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(alpha == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(beta == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + // Run different csrmv kernels + if(trans == rocsparse_operation_none) + { +#define CSRMVN_DIM 512 + rocsparse_int nnz_per_row = nnz / m; + + dim3 csrmvn_blocks((m - 1) / CSRMVN_DIM + 1); + dim3 csrmvn_threads(CSRMVN_DIM); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + if(handle->warp_size == 32) + { + if(nnz_per_row < 4) + { + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y, + descr->base); + } + else if(nnz_per_row < 8) + { + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, 
+ y, + descr->base); + } + else if(nnz_per_row < 16) + { + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y, + descr->base); + } + else if(nnz_per_row < 32) + { + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y, + descr->base); + } + else + { + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y, + descr->base); + } + } + else if(handle->warp_size == 64) + { + if(nnz_per_row < 4) + { + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y, + descr->base); + } + else if(nnz_per_row < 8) + { + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y, + descr->base); + } + else if(nnz_per_row < 16) + { + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y, + descr->base); + } + else if(nnz_per_row < 32) + { + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y, + descr->base); + } + else if(nnz_per_row < 64) + { + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y, + descr->base); + } + else + { + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + alpha, + 
csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y, + descr->base); + } + } + else + { + return rocsparse_status_arch_mismatch; + } + } + else + { + if(*alpha == 0.0 && *beta == 1.0) + { + return rocsparse_status_success; + } + + if(handle->warp_size == 32) + { + if(nnz_per_row < 4) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + else if(nnz_per_row < 8) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + else if(nnz_per_row < 16) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + else if(nnz_per_row < 32) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + else + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + } + else if(handle->warp_size == 64) + { + if(nnz_per_row < 4) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + else if(nnz_per_row < 8) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + else if(nnz_per_row < 16) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + 
stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + else if(nnz_per_row < 32) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + else if(nnz_per_row < 64) + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + else + { + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + } + else + { + return rocsparse_status_arch_mismatch; + } + } +#undef CSRMVN_DIM + } + else + { + // TODO + return rocsparse_status_not_implemented; + } + return rocsparse_status_success; +} + +#endif // ROCSPARSE_CSRMV_HPP diff --git a/library/src/level2/rocsparse_hybmv.cpp b/library/src/level2/rocsparse_hybmv.cpp index e1ceaf7e..933748eb 100644 --- a/library/src/level2/rocsparse_hybmv.cpp +++ b/library/src/level2/rocsparse_hybmv.cpp @@ -3,314 +3,7 @@ * ************************************************************************ */ #include "rocsparse.h" -#include "definitions.h" -#include "handle.h" -#include "utility.h" -#include "coomv_device.h" -#include "ellmv_device.h" - -#include - -template -__global__ void ellmvn_kernel_host_pointer(rocsparse_int m, - rocsparse_int n, - rocsparse_int ell_width, - T alpha, - const rocsparse_int* ell_col_ind, - const T* ell_val, - const T* x, - T beta, - T* y, - rocsparse_index_base idx_base) -{ - ellmvn_device(m, n, ell_width, alpha, ell_col_ind, ell_val, x, beta, y, idx_base); -} - -template -__global__ void ellmvn_kernel_device_pointer(rocsparse_int m, - rocsparse_int n, - rocsparse_int ell_width, - const T* alpha, - const rocsparse_int* ell_col_ind, - const T* ell_val, - 
const T* x, - const T* beta, - T* y, - rocsparse_index_base idx_base) -{ - ellmvn_device(m, n, ell_width, *alpha, ell_col_ind, ell_val, x, *beta, y, idx_base); -} - -template -__global__ void coomvn_warp_host_pointer(rocsparse_int nnz, - rocsparse_int loops, - T alpha, - const rocsparse_int* coo_row_ind, - const rocsparse_int* coo_col_ind, - const T* coo_val, - const T* x, - T* y, - rocsparse_int* row_block_red, - T* val_block_red, - rocsparse_index_base idx_base) -{ - coomvn_general_warp_reduce(nnz, - loops, - alpha, - coo_row_ind, - coo_col_ind, - coo_val, - x, - y, - row_block_red, - val_block_red, - idx_base); -} - -template -rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, - rocsparse_operation trans, - const T* alpha, - const rocsparse_mat_descr descr, - const rocsparse_hyb_mat hyb, - const T* x, - const T* beta, - T* y) -{ - // Check for valid handle and matrix descriptor - if(handle == nullptr) - { - return rocsparse_status_invalid_handle; - } - else if(descr == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(hyb == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Logging TODO bench logging - if(handle->pointer_mode == rocsparse_pointer_mode_host) - { - log_trace(handle, - replaceX("rocsparse_Xhybmv"), - trans, - *alpha, - (const void*&)descr, - (const void*&)hyb, - (const void*&)x, - *beta, - (const void*&)y); - } - else - { - log_trace(handle, - replaceX("rocsparse_Xhybmv"), - trans, - (const void*&)alpha, - (const void*&)descr, - (const void*&)hyb, - (const void*&)x, - (const void*&)beta, - (const void*&)y); - } - - // Check index base - if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) - { - return rocsparse_status_invalid_value; - } - // Check matrix type - if(descr->type != rocsparse_matrix_type_general) - { - // TODO - return rocsparse_status_not_implemented; - } - // TODO check partition type - // if(hyb->partition != rocsparse_hyb_partition_max) - // { - 
// return rocsparse_status_not_implemented; - // } - - // Check sizes - if(hyb->m < 0) - { - return rocsparse_status_invalid_size; - } - else if(hyb->n < 0) - { - return rocsparse_status_invalid_size; - } - else if(hyb->ell_nnz + hyb->coo_nnz < 0) - { - return rocsparse_status_invalid_size; - } - - // Check ELL-HYB structure - if(hyb->ell_nnz > 0) - { - if(hyb->ell_width < 0) - { - return rocsparse_status_invalid_size; - } - else if(hyb->ell_col_ind == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(hyb->ell_val == nullptr) - { - return rocsparse_status_invalid_pointer; - } - } - - // Check COO-HYB structure - if(hyb->coo_nnz > 0) - { - if(hyb->coo_row_ind == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(hyb->coo_col_ind == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(hyb->coo_val == nullptr) - { - return rocsparse_status_invalid_pointer; - } - } - - // Check pointer arguments - if(x == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(y == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(alpha == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(beta == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Quick return if possible - if(hyb->m == 0 || hyb->n == 0 || hyb->ell_nnz + hyb->coo_nnz == 0) - { - return rocsparse_status_success; - } - - // Stream - hipStream_t stream = handle->stream; - - // Run different hybmv kernels - if(trans == rocsparse_operation_none) - { -#define ELLMVN_DIM 512 - dim3 ellmvn_blocks((hyb->m - 1) / ELLMVN_DIM + 1); - dim3 ellmvn_threads(ELLMVN_DIM); - - if(handle->pointer_mode == rocsparse_pointer_mode_device) - { - } - else - { - if(*alpha == 0.0 && *beta == 1.0) - { - return rocsparse_status_success; - } - - // ELL part - if(hyb->ell_nnz > 0) - { - hipLaunchKernelGGL((ellmvn_kernel_host_pointer), - ellmvn_blocks, - ellmvn_threads, - 0, - stream, - hyb->m, - hyb->n, - hyb->ell_width, - 
*alpha, - hyb->ell_col_ind, - (T*)hyb->ell_val, - x, - *beta, - y, - descr->base); - } - - // COO part - if(hyb->coo_nnz > 0) - { -// TODO -#define COOMVN_DIM 128 - rocsparse_int maxthreads = handle->properties.maxThreadsPerBlock; - rocsparse_int nprocs = handle->properties.multiProcessorCount; - rocsparse_int maxblocks = (nprocs * maxthreads - 1) / COOMVN_DIM + 1; - rocsparse_int minblocks = (hyb->coo_nnz - 1) / COOMVN_DIM + 1; - - rocsparse_int nblocks = maxblocks < minblocks ? maxblocks : minblocks; - rocsparse_int nwarps = nblocks * (COOMVN_DIM / handle->warp_size); - rocsparse_int nloops = (hyb->coo_nnz / handle->warp_size + 1) / nwarps + 1; - - dim3 coomvn_blocks(nblocks); - dim3 coomvn_threads(COOMVN_DIM); - - rocsparse_int* row_block_red = NULL; - T* val_block_red = NULL; - - RETURN_IF_HIP_ERROR( - hipMalloc((void**)&row_block_red, sizeof(rocsparse_int) * nwarps)); - RETURN_IF_HIP_ERROR(hipMalloc((void**)&val_block_red, sizeof(T) * nwarps)); - - hipLaunchKernelGGL((coomvn_warp_host_pointer), - coomvn_blocks, - coomvn_threads, - 0, - stream, - hyb->coo_nnz, - nloops, - *alpha, - hyb->coo_row_ind, - hyb->coo_col_ind, - (T*)hyb->coo_val, - x, - y, - row_block_red, - val_block_red, - descr->base); - - hipLaunchKernelGGL((coomvn_general_block_reduce), - dim3(1), - coomvn_threads, - 0, - stream, - nwarps, - row_block_red, - val_block_red, - y); - - RETURN_IF_HIP_ERROR(hipFree(row_block_red)); - RETURN_IF_HIP_ERROR(hipFree(val_block_red)); -#undef COOMVN_DIM - } - } -#undef ELLMVN_DIM - } - else - { - // TODO - return rocsparse_status_not_implemented; - } - return rocsparse_status_success; -} +#include "rocsparse_hybmv.hpp" /* * =========================================================================== diff --git a/library/src/level2/rocsparse_hybmv.hpp b/library/src/level2/rocsparse_hybmv.hpp new file mode 100644 index 00000000..1a5ef94c --- /dev/null +++ b/library/src/level2/rocsparse_hybmv.hpp @@ -0,0 +1,324 @@ +/* 
************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_HYBMV_HPP +#define ROCSPARSE_HYBMV_HPP + +#include "rocsparse.h" +#include "definitions.h" +#include "handle.h" +#include "utility.h" +#include "ellmv_device.h" +#include "rocsparse_coomv.hpp" + +#include + +template +__global__ void ellmvn_kernel_host_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int ell_width, + T alpha, + const rocsparse_int* ell_col_ind, + const T* ell_val, + const T* x, + T beta, + T* y, + rocsparse_index_base idx_base) +{ + ellmvn_device(m, n, ell_width, alpha, ell_col_ind, ell_val, x, beta, y, idx_base); +} + +template +__global__ void ellmvn_kernel_device_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int ell_width, + const T* alpha, + const rocsparse_int* ell_col_ind, + const T* ell_val, + const T* x, + const T* beta, + T* y, + rocsparse_index_base idx_base) +{ + ellmvn_device(m, n, ell_width, *alpha, ell_col_ind, ell_val, x, *beta, y, idx_base); +} + +template +rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, + rocsparse_operation trans, + const T* alpha, + const rocsparse_mat_descr descr, + const rocsparse_hyb_mat hyb, + const T* x, + const T* beta, + T* y) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(hyb == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + if(handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_Xhybmv"), + trans, + *alpha, + (const void*&)descr, + (const void*&)hyb, + (const void*&)x, + *beta, + (const void*&)y); + } + else + { + log_trace(handle, + replaceX("rocsparse_Xhybmv"), + trans, + 
(const void*&)alpha, + (const void*&)descr, + (const void*&)hyb, + (const void*&)x, + (const void*&)beta, + (const void*&)y); + } + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + // Check matrix type + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + // Check partition type + if(hyb->partition != rocsparse_hyb_partition_max && + hyb->partition != rocsparse_hyb_partition_auto && + hyb->partition != rocsparse_hyb_partition_user) + { + return rocsparse_status_invalid_value; + } + + // Check sizes + if(hyb->m < 0) + { + return rocsparse_status_invalid_size; + } + else if(hyb->n < 0) + { + return rocsparse_status_invalid_size; + } + else if(hyb->ell_nnz + hyb->coo_nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check ELL-HYB structure + if(hyb->ell_nnz > 0) + { + if(hyb->ell_width < 0) + { + return rocsparse_status_invalid_size; + } + else if(hyb->ell_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(hyb->ell_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + } + + // Check COO-HYB structure + if(hyb->coo_nnz > 0) + { + if(hyb->coo_row_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(hyb->coo_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(hyb->coo_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + } + + // Check pointer arguments + if(x == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(alpha == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(beta == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(hyb->m == 0 || hyb->n == 0 || hyb->ell_nnz + hyb->coo_nnz == 0) + { + return 
rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + // Run different hybmv kernels + if(trans == rocsparse_operation_none) + { +#define ELLMVN_DIM 512 + dim3 ellmvn_blocks((hyb->m - 1) / ELLMVN_DIM + 1); + dim3 ellmvn_threads(ELLMVN_DIM); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + // ELL part + if(hyb->ell_nnz > 0) + { + hipLaunchKernelGGL((ellmvn_kernel_device_pointer), + ellmvn_blocks, + ellmvn_threads, + 0, + stream, + hyb->m, + hyb->n, + hyb->ell_width, + alpha, + hyb->ell_col_ind, + (T*)hyb->ell_val, + x, + beta, + y, + descr->base); + } + + // COO part + if(hyb->coo_nnz > 0) + { + // Beta is applied by ELL part, IF ell_nnz > 0 + if(hyb->ell_nnz > 0) + { + T one = static_cast(1); + T* coo_beta; + + RETURN_IF_HIP_ERROR(hipMalloc((void**)&coo_beta, sizeof(T))); + RETURN_IF_HIP_ERROR( + hipMemcpy(coo_beta, &one, sizeof(T), hipMemcpyHostToDevice)); + RETURN_IF_ROCSPARSE_ERROR(rocsparse_coomv_template(handle, + trans, + hyb->m, + hyb->n, + hyb->coo_nnz, + alpha, + descr, + (T*)hyb->coo_val, + hyb->coo_row_ind, + hyb->coo_col_ind, + x, + coo_beta, + y)); + RETURN_IF_HIP_ERROR(hipFree(coo_beta)); + } + else + { + RETURN_IF_ROCSPARSE_ERROR(rocsparse_coomv_template(handle, + trans, + hyb->m, + hyb->n, + hyb->coo_nnz, + alpha, + descr, + (T*)hyb->coo_val, + hyb->coo_row_ind, + hyb->coo_col_ind, + x, + beta, + y)); + } + } + } + else + { + if(*alpha == 0.0 && *beta == 1.0) + { + return rocsparse_status_success; + } + + // ELL part + if(hyb->ell_nnz > 0) + { + hipLaunchKernelGGL((ellmvn_kernel_host_pointer), + ellmvn_blocks, + ellmvn_threads, + 0, + stream, + hyb->m, + hyb->n, + hyb->ell_width, + *alpha, + hyb->ell_col_ind, + (T*)hyb->ell_val, + x, + *beta, + y, + descr->base); + } + + // COO part + if(hyb->coo_nnz > 0) + { + // Beta is applied by ELL part, IF ell_nnz > 0 + T coo_beta = (hyb->ell_nnz > 0) ? 
1.0 : *beta; + + RETURN_IF_ROCSPARSE_ERROR(rocsparse_coomv_template(handle, + trans, + hyb->m, + hyb->n, + hyb->coo_nnz, + alpha, + descr, + (T*)hyb->coo_val, + hyb->coo_row_ind, + hyb->coo_col_ind, + x, + &coo_beta, + y)); + } + } + } +#undef ELLMVN_DIM + else + { + // TODO + return rocsparse_status_not_implemented; + } + return rocsparse_status_success; +} + +#endif // ROCSPARSE_HYBMV_HPP From a409e82422829fe64cf053f2f5e70db00684dbd3 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 09:47:19 +0200 Subject: [PATCH 092/304] cuda clients/benchmarks --- clients/benchmarks/CMakeLists.txt | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/clients/benchmarks/CMakeLists.txt b/clients/benchmarks/CMakeLists.txt index 84997d2a..fe77054f 100644 --- a/clients/benchmarks/CMakeLists.txt +++ b/clients/benchmarks/CMakeLists.txt @@ -40,17 +40,27 @@ target_include_directories(rocsparse-bench $ ) -target_link_libraries(rocsparse-bench - PRIVATE - ${Boost_LIBRARIES} - roc::rocsparse - hip::hip_hcc - hip::hip_device -) +if(HIP_PLATFORM STREQUAL "hcc") + target_link_libraries(rocsparse-bench + PRIVATE + ${Boost_LIBRARIES} + roc::rocsparse + hip::hip_hcc + hip::hip_device + ) + + foreach(amdgpu_target ${AMDGPU_TARGETS}) + target_link_libraries(rocsparse-bench + PRIVATE + --amdgpu-target=${amdgpu_target} + ) + endforeach() +endif() -foreach(amdgpu_target ${AMDGPU_TARGETS}) +if(HIP_PLATFORM STREQUAL "nvcc") target_link_libraries(rocsparse-bench PRIVATE - --amdgpu-target=${amdgpu_target} + ${Boost_LIBRARIES} + roc::rocsparse ) -endforeach() +endif() From 035844b6d821bd832807e027513daf6d449d5c5b Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 10:53:14 +0200 Subject: [PATCH 093/304] csr2hyb, hybmv: added tests --- clients/include/testing_csr2hyb.hpp | 103 +++++++---- clients/include/testing_hybmv.hpp | 272 ++++++++++++++++++++++++++++ clients/tests/test_hybmv.cpp | 4 +- 3 files changed, 343 insertions(+), 36 
deletions(-) diff --git a/clients/include/testing_csr2hyb.hpp b/clients/include/testing_csr2hyb.hpp index 54b4efaf..4874826c 100644 --- a/clients/include/testing_csr2hyb.hpp +++ b/clients/include/testing_csr2hyb.hpp @@ -204,26 +204,46 @@ rocsparse_status testing_csr2hyb(Arguments argus) // For testing, assemble a COO matrix and convert it to CSR first (on host) // Host structures - std::vector hcoo_row_ind(nnz); - std::vector hcsr_col_ind(nnz); - std::vector hcsr_val(nnz); + std::vector hcsr_row_ptr; + std::vector hcoo_row_ind; + std::vector hcsr_col_ind; + std::vector hcsr_val; // Sample initial COO matrix on CPU srand(12345ULL); - gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base); - - // Convert COO to CSR - std::vector hcsr_row_ptr(m + 1); - - for(rocsparse_int i = 0; i < nnz; ++i) + if(argus.laplacian) { - ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; + m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base); + nnz = hcsr_row_ptr[m]; } - - hcsr_row_ptr[0] = idx_base; - for(rocsparse_int i = 0; i < m; ++i) + else { - hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + if(argus.filename != "") + { + if(read_mtx_matrix( + argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base); + } + + // Convert COO to CSR + hcsr_row_ptr.resize(m + 1, 0); + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; + } + + hcsr_row_ptr[0] = idx_base; + for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + } } // Allocate memory on the device @@ -415,32 +435,47 @@ rocsparse_status testing_csr2hyb(Arguments argus) unit_check_general(1, coo_nnz, hhyb_coo_val_gold.data(), hhyb_coo_val.data()); } - /* - if(argus.timing) - { - rocsparse_int 
number_cold_calls = 2; - rocsparse_int number_hot_calls = argus.iters; + if(argus.timing) + { + rocsparse_int number_cold_calls = 2; + rocsparse_int number_hot_calls = argus.iters; - for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) - { - rocsparse_csr2hyb(handle, dcsr_row_ptr, nnz, m, dhyb_row_ind, idx_base); - } + for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) + { + rocsparse_csr2hyb(handle, + m, + n, + descr, + dcsr_val, + dcsr_row_ptr, + dcsr_col_ind, + hyb, + user_ell_width, + part); + } - double gpu_time_used = get_time_us(); + double gpu_time_used = get_time_us(); - for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) - { - rocsparse_csr2hyb(handle, dcsr_row_ptr, nnz, m, dhyb_row_ind, idx_base); - } + for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) + { + rocsparse_csr2hyb(handle, + m, + n, + descr, + dcsr_val, + dcsr_row_ptr, + dcsr_col_ind, + hyb, + user_ell_width, + part); + } - gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); - double bandwidth = sizeof(rocsparse_int) * (nnz + m + 1) / gpu_time_used / 1e6; + printf("m\t\tn\t\tnnz\t\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\n", m, n, nnz, gpu_time_used); + } - printf("m\t\tn\t\tnnz\t\tGB/s\tmsec\n"); - printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\n", m, n, nnz, bandwidth, gpu_time_used); - } - */ return rocsparse_status_success; } diff --git a/clients/include/testing_hybmv.hpp b/clients/include/testing_hybmv.hpp index 8a6c4249..22b239ab 100644 --- a/clients/include/testing_hybmv.hpp +++ b/clients/include/testing_hybmv.hpp @@ -17,6 +17,21 @@ using namespace rocsparse; using namespace rocsparse_test; +struct testhyb +{ + rocsparse_int m; + rocsparse_int n; + rocsparse_hyb_partition partition; + rocsparse_int ell_nnz; + rocsparse_int ell_width; + rocsparse_int* ell_col_ind; + void* ell_val; + rocsparse_int coo_nnz; + rocsparse_int* coo_row_ind; + rocsparse_int* 
coo_col_ind; + void* coo_val; +}; + template void testing_hybmv_bad_arg(void) { @@ -101,6 +116,263 @@ void testing_hybmv_bad_arg(void) template rocsparse_status testing_hybmv(Arguments argus) { + rocsparse_int safe_size = 100; + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + T h_alpha = argus.alpha; + T h_beta = argus.beta; + rocsparse_operation trans = argus.trans; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_hyb_partition part = argus.part; + rocsparse_int user_ell_width = argus.ell_width; + rocsparse_status status; + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + std::unique_ptr test_descr(new descr_struct); + rocsparse_mat_descr descr = test_descr->descr; + + // Set matrix index base + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr, idx_base)); + + std::unique_ptr test_hyb(new hyb_struct); + rocsparse_hyb_mat hyb = test_hyb->hyb; + + // Determine number of non-zero elements + double scale = 0.02; + if(m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + + // Argument sanity check before allocating invalid memory + if(m <= 0 || n <= 0 || nnz <= 0) + { + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dval || !dptr || !dcol || !dx || !dy) + { + 
verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dptr || !dcol || !dval || !dx || !dy"); + return rocsparse_status_memory_error; + } + + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + status = + rocsparse_csr2hyb(handle, m, n, descr, dval, dptr, dcol, hyb, user_ell_width, part); + + if(m < 0 || n < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); + } + + // hybmv should be able to deal with m <= 0 || n <= 0 || nnz <= 0 even if csr2hyb fails + // because hyb structures is allocated with n = m = 0 - so nothing should happen + status = rocsparse_hybmv(handle, trans, &h_alpha, descr, hyb, dx, &h_beta, dy); + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + + return rocsparse_status_success; + } + + // Host structures + std::vector hcsr_row_ptr; + std::vector hcoo_row_ind; + std::vector hcol_ind; + std::vector hval; + + // Initial Data on CPU + srand(12345ULL); + if(argus.laplacian) + { + m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcol_ind, hval, idx_base); + nnz = hcsr_row_ptr[m]; + } + else + { + if(argus.filename != "") + { + if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval) != + 0) + { + fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base); + } + + // Convert COO to CSR + hcsr_row_ptr.resize(m + 1, 0); + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; + } + + hcsr_row_ptr[0] = idx_base; + for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + } + } + + std::vector hx(n); + std::vector hy_1(m); + std::vector hy_2(m); + std::vector hy_gold(m); + + rocsparse_init(hx, 1, n); + rocsparse_init(hy_1, 1, m); + + // copy vector is easy in STL; hy_gold = hx: save a copy in 
hy_gold which will be output of CPU + hy_2 = hy_1; + hy_gold = hy_1; + + // allocate memory on device + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * n), device_free}; + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + auto d_beta_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy_1 = (T*)dy_1_managed.get(); + T* dy_2 = (T*)dy_2_managed.get(); + T* d_alpha = (T*)d_alpha_managed.get(); + T* d_beta = (T*)d_beta_managed.get(); + + if(!dval || !dptr || !dcol || !dx || !dy_1 || !dy_2 || !d_alpha || !d_beta) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dval || !dptr || !dcol || !dx || " + "!dy_1 || !dy_2 || !d_alpha || !d_beta"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR(hipMemcpy( + dptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dcol, hcol_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dval, hval.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * n, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T) * m, hipMemcpyHostToDevice)); + 
CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); + + // User given ELL width + if(part == rocsparse_hyb_partition_user) + { + user_ell_width = user_ell_width * nnz / m; + } + + // Convert CSR to HYB + CHECK_ROCSPARSE_ERROR( + rocsparse_csr2hyb(handle, m, n, descr, dval, dptr, dcol, hyb, user_ell_width, part)); + + if(argus.unit_check) + { + CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * m, hipMemcpyHostToDevice)); + + // ROCSPARSE pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR( + rocsparse_hybmv(handle, trans, &h_alpha, descr, hyb, dx, &h_beta, dy_1)); + + // ROCSPARSE pointer mode device + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR( + rocsparse_hybmv(handle, trans, d_alpha, descr, hyb, dx, d_beta, dy_2)); + + // copy output from device to CPU + CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * m, hipMemcpyDeviceToHost)); + + // CPU + double cpu_time_used = get_time_us(); + + for(rocsparse_int i = 0; i < m; ++i) + { + hy_gold[i] *= h_beta; + for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; + ++j) + { + hy_gold[i] += h_alpha * hval[j] * hx[hcol_ind[j] - idx_base]; + } + } + + cpu_time_used = get_time_us() - cpu_time_used; + + unit_check_general(1, m, hy_gold.data(), hy_1.data()); + unit_check_general(1, m, hy_gold.data(), hy_2.data()); + } + + if(argus.timing) + { + int number_cold_calls = 2; + int number_hot_calls = argus.iters; + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + for(int iter = 0; iter < number_cold_calls; iter++) + { + rocsparse_hybmv(handle, trans, &h_alpha, descr, hyb, dx, &h_beta, dy_1); + } + + 
double gpu_time_used = get_time_us(); // in microseconds + + for(int iter = 0; iter < number_hot_calls; iter++) + { + rocsparse_hybmv(handle, trans, &h_alpha, descr, hyb, dx, &h_beta, dy_1); + } + + testhyb* dhyb = (testhyb*)hyb; + + // Convert to miliseconds per call + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + size_t flops = (h_alpha != 1.0) ? 3.0 * nnz : 2.0 * nnz; + flops = (h_beta != 0.0) ? flops + m : flops; + double gpu_gflops = flops / gpu_time_used / 1e6; + size_t ell_mem = dhyb->ell_nnz * (sizeof(rocsparse_int) + sizeof(T)); + size_t coo_mem = dhyb->coo_nnz * (sizeof(rocsparse_int) * 2 + sizeof(T)); + size_t memtrans = 2 * m + ell_mem + coo_mem; + memtrans = (h_beta != 0.0) ? memtrans + m : memtrans; + double bandwidth = memtrans / gpu_time_used / 1e6; + + printf("m\t\tn\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", + m, + n, + nnz, + h_alpha, + h_beta, + gpu_gflops, + bandwidth, + gpu_time_used); + } + return rocsparse_status_success; } diff --git a/clients/tests/test_hybmv.cpp b/clients/tests/test_hybmv.cpp index a61619ec..cecf7747 100644 --- a/clients/tests/test_hybmv.cpp +++ b/clients/tests/test_hybmv.cpp @@ -16,14 +16,14 @@ int hyb_M_range[] = {-1, 0, 10, 500, 7111, 10000}; int hyb_N_range[] = {-3, 0, 33, 842, 4441, 10000}; std::vector hyb_alpha_range = {2.0, 3.0}; -std::vector hyb_beta_range = {0.0, 1.0}; +std::vector hyb_beta_range = {0.0, 0.67, 1.0}; rocsparse_index_base hyb_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; rocsparse_hyb_partition hyb_partition[] = { rocsparse_hyb_partition_auto, rocsparse_hyb_partition_max, rocsparse_hyb_partition_user}; -int hyb_ELL_range[] = {-33, -1, 0, INT32_MAX}; +int hyb_ELL_range[] = {0, 1, 2}; class parameterized_hybmv : public testing::TestWithParam { From 109b33211fef39ea46cbe05aecb60ee5827c04a8 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 10:53:34 +0200 
Subject: [PATCH 094/304] csr2hyb, hybmv: added benchmarks --- clients/benchmarks/client.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 648cfea5..463f4788 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -6,8 +6,10 @@ #include "rocsparse.hpp" #include "testing_coomv.hpp" #include "testing_csrmv.hpp" +#include "testing_hybmv.hpp" #include "testing_axpyi.hpp" #include "testing_csr2coo.hpp" +#include "testing_csr2hyb.hpp" #include "testing_coo2csr.hpp" #include @@ -63,7 +65,8 @@ int main(int argc, char* argv[]) ("function,f", po::value(&function)->default_value("axpyi"), - "SPARSE function to test. Options: axpyi, coomv, csrmv, csr2coo, coo2csr") + "SPARSE function to test. Options: axpyi, coomv, csrmv, hybmv, csr2coo, csr2hyb, " + "coo2csr") ("precision,r", po::value(&precision)->default_value('s'), "Options: s,d") @@ -139,10 +142,24 @@ int main(int argc, char* argv[]) else if(precision == 'd') testing_csrmv(argus); } + else if(function == "hybmv") + { + if(precision == 's') + testing_hybmv(argus); + else if(precision == 'd') + testing_hybmv(argus); + } else if(function == "csr2coo") { testing_csr2coo(argus); } + else if(function == "csr2hyb") + { + if(precision == 's') + testing_csr2hyb(argus); + else if(precision == 'd') + testing_csr2hyb(argus); + } else if(function == "coo2csr") { testing_coo2csr(argus); From 2444b73a2dd9205850b34542a9315435b56efc49 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 11:22:10 +0200 Subject: [PATCH 095/304] bugfix in sparse coo matrix generator - been running out of bounds --- clients/include/utility.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index 7e792d22..30e2f437 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -261,6 +261,10 @@ void gen_matrix_coo(rocsparse_int m, 
while(row_ind[i] == row_ind[begin]) { ++i; + if(i >= nnz) + { + break; + } } // Sample i disjunct column indices From 6456f465fba245735818d4ee078a2cf2af005746 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 11:32:34 +0200 Subject: [PATCH 096/304] switched jenkinsfile to run nightly tests 10pm austin time --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index f008822d..6009f66a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -4,7 +4,7 @@ // Mostly generated from snippet generator 'properties; set job properties' // Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM properties([ - pipelineTriggers([cron('0 3 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]), + pipelineTriggers([cron('0 22 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]), buildDiscarder(logRotator( artifactDaysToKeepStr: '', artifactNumToKeepStr: '', From 1d15f6ae741820734e383d9d9656efbb3122c55c Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 11:47:48 +0200 Subject: [PATCH 097/304] updated README.md --- CMakeLists.txt | 2 +- README.md | 43 +++++++++++++++++++++++++++++++------------ 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 70573371..fb84607c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,7 +42,7 @@ set(CMAKE_CXX_EXTENSIONS OFF) # Build options option(BUILD_SHARED_LIBS "Build rocSPARSE as a shared library" ON) option(BUILD_CLIENTS_TESTS "Build tests (requires googletest)" OFF) -option(BUILD_CLIENTS_BENCHMARKS "Build benchmarks (requires googlebenchmark)" OFF) +option(BUILD_CLIENTS_BENCHMARKS "Build benchmarks (requires boost)" OFF) option(BUILD_CLIENTS_SAMPLES "Build examples" ON) option(BUILD_VERBOSE "Output additional build information" OFF) diff --git a/README.md b/README.md index d7681912..5fb8112a 100644 --- a/README.md +++ b/README.md @@ -28,12 +28,12 @@ cd rocSPARSE; mkdir build; cd 
build # Configure rocSPARSE # Build options: -# BUILD_TEST - build tests using [GTest][] (OFF) -# BUILD_BENCHMARK - build benchmarks using [Google Benchmark][] (OFF) -# BUILD_EXAMPLE - build examples (ON) -# BUILD_VERBOSE - verbose output (OFF) -# BUILD_SHARED_LIBS - build rocSPARSE as a shared library (ON) -cmake -DBUILD_TEST=ON .. +# BUILD_CLIENTS_TESTS - build tests using [GTest][] (OFF) +# BUILD_CLIENTS_BENCHMARKS - build benchmarks (OFF) +# BUILD_CLIENTS_SAMPLES - build examples (ON) +# BUILD_VERBOSE - verbose output (OFF) +# BUILD_SHARED_LIBS - build rocSPARSE as a shared library (ON) +cmake -DBUILD_CLIENTS_TESTS=ON .. # Build make @@ -42,24 +42,44 @@ make [sudo] make install ``` +#### Install script +You can also build rocSPARSE using the *install.sh* script +``` +# Clone rocSPARSE using git +git clone https://github.com/ROCmSoftwarePlatform/rocSparse.git + +# Go to rocSPARSE directory +cd rocSPARSE + +# Run install.sh script +# Command line options: +# -h|--help - prints help message +# -i|--install - install after build +# -d|--dependencies - install build dependencies +# -c|--clients - build library clients too (combines with -i & -d) +# -g|--debug - build with debug flag +# --cuda - build library for cuda backend +./install.sh -dci +``` + ## Unit tests -To run unit tests, rocSPARSE has to be built with option -DBUILD_TEST=ON. +To run unit tests, rocSPARSE has to be built with option -DBUILD_CLIENTS_TESTS=ON. ``` # Go to rocSPARSE build directory cd rocSPARSE; cd build # Run all tests -ctest +./clients/tests/rocsparse-test ``` ## Benchmarks -To run benchmarks, rocSPARSE has to be built with option -DBUILD_BENCHMARK=ON. +To run benchmarks, rocSPARSE has to be built with option -DBUILD_CLIENTS_BENCHMARKS=ON. ``` # Go to rocSPARSE build directory cd rocSPARSE/build -# Run benchmark -./benchmark/benchmark_csrmv +# Run benchmark, e.g. 
+./clients/benchmarks/rocsparse-bench -f hybmv --laplacian-dim 2000 -i 200 ``` ## Support @@ -73,6 +93,5 @@ The [license file][] can be found in the main repository. [ROCm]: https://github.com/RadeonOpenCompute/ROCm [HIP]: https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/ [GTest]: https://github.com/google/googletest -[Google Benchmark]: https://github.com/google/benchmark [the issue tracker]: https://github.com/ROCmSoftwarePlatform/rocSparse/issues [license file]: ./LICENSE.md From 56bff0e1e5e31e356620a5cece0f0cd0fee29405 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 11:52:22 +0200 Subject: [PATCH 098/304] bugfix: error in vector length of xxxmv samples --- clients/samples/example_coomv.cpp | 8 ++++---- clients/samples/example_csrmv.cpp | 8 ++++---- clients/samples/example_ellmv.cpp | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/clients/samples/example_coomv.cpp b/clients/samples/example_coomv.cpp index 25e89791..ec5cf2c4 100644 --- a/clients/samples/example_coomv.cpp +++ b/clients/samples/example_coomv.cpp @@ -67,8 +67,8 @@ int main(int argc, char* argv[]) double halpha = static_cast(rand()) / RAND_MAX; double hbeta = 0.0; - std::vector hx(m); - rocsparse_init(hx, 1, m); + std::vector hx(n); + rocsparse_init(hx, 1, n); // Matrix descriptor rocsparse_mat_descr descrA; @@ -84,13 +84,13 @@ int main(int argc, char* argv[]) hipMalloc((void**)&dArow, sizeof(int) * nnz); hipMalloc((void**)&dAcol, sizeof(int) * nnz); hipMalloc((void**)&dAval, sizeof(double) * nnz); - hipMalloc((void**)&dx, sizeof(double) * m); + hipMalloc((void**)&dx, sizeof(double) * n); hipMalloc((void**)&dy, sizeof(double) * m); hipMemcpy(dArow, hArow.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); hipMemcpy(dAcol, hAcol.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); hipMemcpy(dAval, hAval.data(), sizeof(double) * nnz, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(double) * m, hipMemcpyHostToDevice); + hipMemcpy(dx, hx.data(), 
sizeof(double) * n, hipMemcpyHostToDevice); // Warm up for(int i = 0; i < 10; ++i) diff --git a/clients/samples/example_csrmv.cpp b/clients/samples/example_csrmv.cpp index 6c6749ca..6c5421f4 100644 --- a/clients/samples/example_csrmv.cpp +++ b/clients/samples/example_csrmv.cpp @@ -56,8 +56,8 @@ int main(int argc, char* argv[]) double halpha = static_cast(rand()) / RAND_MAX; double hbeta = 0.0; - std::vector hx(m); - rocsparse_init(hx, 1, m); + std::vector hx(n); + rocsparse_init(hx, 1, n); // Matrix descriptor rocsparse_mat_descr descrA; @@ -73,13 +73,13 @@ int main(int argc, char* argv[]) hipMalloc((void**)&dAptr, sizeof(int) * (m + 1)); hipMalloc((void**)&dAcol, sizeof(int) * nnz); hipMalloc((void**)&dAval, sizeof(double) * nnz); - hipMalloc((void**)&dx, sizeof(double) * m); + hipMalloc((void**)&dx, sizeof(double) * n); hipMalloc((void**)&dy, sizeof(double) * m); hipMemcpy(dAptr, hAptr.data(), sizeof(int) * (m + 1), hipMemcpyHostToDevice); hipMemcpy(dAcol, hAcol.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); hipMemcpy(dAval, hAval.data(), sizeof(double) * nnz, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(double) * m, hipMemcpyHostToDevice); + hipMemcpy(dx, hx.data(), sizeof(double) * n, hipMemcpyHostToDevice); // Warm up for(int i = 0; i < 10; ++i) diff --git a/clients/samples/example_ellmv.cpp b/clients/samples/example_ellmv.cpp index c0445c2f..2cf296b2 100644 --- a/clients/samples/example_ellmv.cpp +++ b/clients/samples/example_ellmv.cpp @@ -56,8 +56,8 @@ int main(int argc, char* argv[]) double halpha = static_cast(rand()) / RAND_MAX; double hbeta = 0.0; - std::vector hx(m); - rocsparse_init(hx, 1, m); + std::vector hx(n); + rocsparse_init(hx, 1, n); // Matrix descriptor rocsparse_mat_descr descrA; @@ -73,13 +73,13 @@ int main(int argc, char* argv[]) hipMalloc((void**)&dAptr, sizeof(int) * (m + 1)); hipMalloc((void**)&dAcol, sizeof(int) * nnz); hipMalloc((void**)&dAval, sizeof(double) * nnz); - hipMalloc((void**)&dx, sizeof(double) * m); + 
hipMalloc((void**)&dx, sizeof(double) * n); hipMalloc((void**)&dy, sizeof(double) * m); hipMemcpy(dAptr, hAptr.data(), sizeof(int) * (m + 1), hipMemcpyHostToDevice); hipMemcpy(dAcol, hAcol.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); hipMemcpy(dAval, hAval.data(), sizeof(double) * nnz, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(double) * m, hipMemcpyHostToDevice); + hipMemcpy(dx, hx.data(), sizeof(double) * n, hipMemcpyHostToDevice); // Convert CSR matrix to HYB format, using partition type to be // rocsparse_hyb_partition_max. This will result in ELL matrix format, From 118edffb0f7cb3234d2ded11e426bcdce426914a Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 11:52:46 +0200 Subject: [PATCH 099/304] hybmv sample added --- clients/samples/CMakeLists.txt | 1 + clients/samples/example_hybmv.cpp | 145 ++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 clients/samples/example_hybmv.cpp diff --git a/clients/samples/CMakeLists.txt b/clients/samples/CMakeLists.txt index 8a92e5f8..36a600b3 100644 --- a/clients/samples/CMakeLists.txt +++ b/clients/samples/CMakeLists.txt @@ -47,3 +47,4 @@ add_rocsparse_example(example_handle.cpp) add_rocsparse_example(example_coomv.cpp) add_rocsparse_example(example_csrmv.cpp) add_rocsparse_example(example_ellmv.cpp) +add_rocsparse_example(example_hybmv.cpp) diff --git a/clients/samples/example_hybmv.cpp b/clients/samples/example_hybmv.cpp new file mode 100644 index 00000000..2bdcec6b --- /dev/null +++ b/clients/samples/example_hybmv.cpp @@ -0,0 +1,145 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "utility.hpp" + +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Parse command line + if(argc < 2) + { + fprintf(stderr, "%s [ ]\n", argv[0]); + return -1; + } + + int ndim = atoi(argv[1]); + int trials = 200; + int batch_size = 1; + + if(argc > 2) + { + trials = atoi(argv[2]); + } + if(argc > 3) + { + batch_size = atoi(argv[3]); + } + + // rocSPARSE handle + rocsparse_handle handle; + rocsparse_create_handle(&handle); + + hipDeviceProp_t devProp; + int device_id = 0; + + hipGetDevice(&device_id); + hipGetDeviceProperties(&devProp, device_id); + printf("Device: %s\n", devProp.name); + + // Generate problem in CSR format + std::vector hAptr; + std::vector hAcol; + std::vector hAval; + int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); + int n = m; + int nnz = hAptr[m]; + + // Sample some random data + srand(12345ULL); + + double halpha = static_cast(rand()) / RAND_MAX; + double hbeta = 0.0; + + std::vector hx(n); + rocsparse_init(hx, 1, n); + + // Matrix descriptor + rocsparse_mat_descr descrA; + rocsparse_create_mat_descr(&descrA); + + // Offload data to device + int* dAptr = NULL; + int* dAcol = NULL; + double* dAval = NULL; + double* dx = NULL; + double* dy = NULL; + + hipMalloc((void**)&dAptr, sizeof(int) * (m + 1)); + hipMalloc((void**)&dAcol, sizeof(int) * nnz); + hipMalloc((void**)&dAval, sizeof(double) * nnz); + hipMalloc((void**)&dx, sizeof(double) * n); + hipMalloc((void**)&dy, sizeof(double) * m); + + hipMemcpy(dAptr, hAptr.data(), sizeof(int) * (m + 1), hipMemcpyHostToDevice); + hipMemcpy(dAcol, hAcol.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dAval, hAval.data(), sizeof(double) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dx, hx.data(), sizeof(double) * n, hipMemcpyHostToDevice); + + // Convert CSR matrix to HYB format + rocsparse_hyb_mat hybA; + rocsparse_create_hyb_mat(&hybA); + + 
rocsparse_dcsr2hyb( + handle, m, n, descrA, dAval, dAptr, dAcol, hybA, 0, rocsparse_hyb_partition_auto); + + // Clean up CSR structures + hipFree(dAptr); + hipFree(dAcol); + hipFree(dAval); + + // Warm up + for(int i = 0; i < 10; ++i) + { + // Call rocsparse hybmv + rocsparse_dhybmv(handle, rocsparse_operation_none, &halpha, descrA, hybA, dx, &hbeta, dy); + } + + // Device synchronization + hipDeviceSynchronize(); + + // Start time measurement + double time = get_time_us(); + + // HYB matrix vector multiplication + for(int i = 0; i < trials; ++i) + { + for(int i = 0; i < batch_size; ++i) + { + // Call rocsparse hybmv + rocsparse_dhybmv( + handle, rocsparse_operation_none, &halpha, descrA, hybA, dx, &hbeta, dy); + } + + // Device synchronization + hipDeviceSynchronize(); + } + + time = (get_time_us() - time) / (trials * batch_size * 1e3); + double bandwidth = + static_cast(sizeof(double) * (2 * m + nnz) + sizeof(rocsparse_int) * (nnz)) / time / + 1e6; + double gflops = static_cast(2 * nnz) / time / 1e6; + printf("m\t\tn\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tusec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", + m, + n, + nnz, + halpha, + hbeta, + gflops, + bandwidth, + time); + + // Clear up on device + rocsparse_destroy_hyb_mat(hybA); + rocsparse_destroy_mat_descr(descrA); + rocsparse_destroy_handle(handle); + + return 0; +} From d266a6831972660f1ff3b2af7539a72ee982db59 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 12:34:31 +0200 Subject: [PATCH 100/304] moved templates to header --- library/src/conversion/rocsparse_csr2hyb.cpp | 298 +----------------- library/src/conversion/rocsparse_csr2hyb.hpp | 308 +++++++++++++++++++ library/src/level1/axpyi_device.h | 29 ++ library/src/level1/rocsparse_axpyi.cpp | 185 +---------- library/src/level1/rocsparse_axpyi.hpp | 178 +++++++++++ 5 files changed, 517 insertions(+), 481 deletions(-) create mode 100644 library/src/conversion/rocsparse_csr2hyb.hpp create mode 100644 
library/src/level1/axpyi_device.h create mode 100644 library/src/level1/rocsparse_axpyi.hpp diff --git a/library/src/conversion/rocsparse_csr2hyb.cpp b/library/src/conversion/rocsparse_csr2hyb.cpp index 098faf90..891a321c 100644 --- a/library/src/conversion/rocsparse_csr2hyb.cpp +++ b/library/src/conversion/rocsparse_csr2hyb.cpp @@ -3,303 +3,7 @@ * ************************************************************************ */ #include "rocsparse.h" -#include "definitions.h" -#include "handle.h" -#include "utility.h" -#include "csr2hyb_device.h" - -#include - -template -rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, - rocsparse_int m, - rocsparse_int n, - const rocsparse_mat_descr descr, - const T* csr_val, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - rocsparse_hyb_mat hyb, - rocsparse_int user_ell_width, - rocsparse_hyb_partition partition_type) -{ - // Check for valid handle and matrix descriptor - if(handle == nullptr) - { - return rocsparse_status_invalid_handle; - } - else if(descr == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(hyb == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Logging TODO bench logging - log_trace(handle, - replaceX("rocsparse_Xcsr2hyb"), - m, - n, - (const void*&)descr, - (const void*&)csr_val, - (const void*&)csr_row_ptr, - (const void*&)csr_col_ind, - (const void*&)hyb, - user_ell_width, - partition_type); - - // Check index base - if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) - { - return rocsparse_status_invalid_value; - } - // Check matrix type - if(descr->type != rocsparse_matrix_type_general) - { - // TODO - return rocsparse_status_not_implemented; - } - // Check partition type - if(partition_type != rocsparse_hyb_partition_max && - partition_type != rocsparse_hyb_partition_user && - partition_type != rocsparse_hyb_partition_auto) - { - return rocsparse_status_invalid_value; - } - - // Check sizes 
- if(m < 0) - { - return rocsparse_status_invalid_size; - } - else if(n < 0) - { - return rocsparse_status_invalid_size; - } - - // Check pointer arguments - if(csr_val == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(csr_row_ptr == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(csr_col_ind == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Quick return if possible - if(m == 0 || n == 0) - { - return rocsparse_status_success; - } - - // Get number of CSR non-zeros - rocsparse_int csr_nnz; - RETURN_IF_HIP_ERROR( - hipMemcpy(&csr_nnz, csr_row_ptr + m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); - - // Correct by index base - csr_nnz -= descr->base; - - // Check user_ell_width - if(partition_type == rocsparse_hyb_partition_user) - { - // ELL width cannot be 0 or negative - if(user_ell_width < 0) - { - return rocsparse_status_invalid_value; - } - - rocsparse_int max_row_nnz = (2 * csr_nnz - 1) / m + 1; - if(user_ell_width > max_row_nnz) - { - return rocsparse_status_invalid_value; - } - } - - // Stream - hipStream_t stream = handle->stream; - - // Clear HYB structure if already allocated - hyb->m = m; - hyb->n = n; - hyb->partition = partition_type; - hyb->ell_nnz = 0; - hyb->ell_width = 0; - hyb->coo_nnz = 0; - - if(hyb->ell_col_ind) - { - RETURN_IF_HIP_ERROR(hipFree(hyb->ell_col_ind)); - } - if(hyb->ell_val) - { - RETURN_IF_HIP_ERROR(hipFree(hyb->ell_val)); - } - if(hyb->coo_row_ind) - { - RETURN_IF_HIP_ERROR(hipFree(hyb->coo_row_ind)); - } - if(hyb->coo_col_ind) - { - RETURN_IF_HIP_ERROR(hipFree(hyb->coo_col_ind)); - } - if(hyb->coo_val) - { - RETURN_IF_HIP_ERROR(hipFree(hyb->coo_val)); - } - -// Determine ELL width - -#define CSR2ELL_DIM 512 - // Workspace size - rocsparse_int blocks = (m - 1) / CSR2ELL_DIM + 1; - // Allocate workspace - rocsparse_int* workspace = NULL; - RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace, sizeof(rocsparse_int) * blocks)); - - if(partition_type == 
rocsparse_hyb_partition_user) - { - // ELL width given by user - hyb->ell_width = user_ell_width; - } - else if(partition_type == rocsparse_hyb_partition_auto) - { - // ELL width determined by average nnz per row - hyb->ell_width = (csr_nnz - 1) / m + 1; - } - else - { - // HYB == ELL - no COO part - compute maximum nnz per row - hipLaunchKernelGGL((ell_width_kernel_part1), - dim3(blocks), - dim3(CSR2ELL_DIM), - 0, - stream, - m, - csr_row_ptr, - workspace); - - hipLaunchKernelGGL((ell_width_kernel_part2), - dim3(1), - dim3(CSR2ELL_DIM), - 0, - stream, - blocks, - workspace); - // Copy ell width back to host - RETURN_IF_HIP_ERROR( - hipMemcpy(&hyb->ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); - } - - // Compute ELL non-zeros - hyb->ell_nnz = hyb->ell_width * m; - - // Allocate ELL part - if(hyb->ell_nnz > 0) - { - RETURN_IF_HIP_ERROR( - hipMalloc((void**)&hyb->ell_col_ind, sizeof(rocsparse_int) * hyb->ell_nnz)); - RETURN_IF_HIP_ERROR(hipMalloc(&hyb->ell_val, sizeof(T) * hyb->ell_nnz)); - } - - // Allocate workspace2 - rocsparse_int* workspace2 = NULL; - RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace2, sizeof(rocsparse_int) * (m + 1))); - - // If there is a COO part, compute the COO non-zero elements per row - if(partition_type != rocsparse_hyb_partition_max) - { - // If there is no ELL part, its easy... 
- if(hyb->ell_nnz == 0) - { - hyb->coo_nnz = csr_nnz; - RETURN_IF_HIP_ERROR(hipMemcpy( - workspace2, csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToDevice)); - } - else - { - hipLaunchKernelGGL((hyb_coo_nnz_part1), - dim3((m - 1) / CSR2ELL_DIM + 1), - dim3(CSR2ELL_DIM), - 0, - stream, - m, - hyb->ell_width, - csr_row_ptr, - workspace, - workspace2); - - hipLaunchKernelGGL((hyb_coo_nnz_part2), - dim3(1), - dim3(CSR2ELL_DIM), - 0, - stream, - blocks, - workspace); - - RETURN_IF_HIP_ERROR( - hipMemcpy(&hyb->coo_nnz, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); - - // Perform exclusive scan on workspace TODO use rocPRIM - std::vector hbuf(m + 1); - RETURN_IF_HIP_ERROR(hipMemcpy( - hbuf.data() + 1, workspace2, sizeof(rocsparse_int) * m, hipMemcpyDeviceToHost)); - - hbuf[0] = descr->base; - for(rocsparse_int i = 0; i < m; ++i) - { - hbuf[i + 1] += hbuf[i]; - } - - RETURN_IF_HIP_ERROR(hipMemcpy( - workspace2, hbuf.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); - } - } - - RETURN_IF_HIP_ERROR(hipFree(workspace)); - - // Allocate COO part - if(hyb->coo_nnz > 0) - { - RETURN_IF_HIP_ERROR( - hipMalloc((void**)&hyb->coo_row_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); - RETURN_IF_HIP_ERROR( - hipMalloc((void**)&hyb->coo_col_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); - RETURN_IF_HIP_ERROR(hipMalloc(&hyb->coo_val, sizeof(T) * hyb->coo_nnz)); - } - - dim3 csr2ell_blocks((m - 1) / CSR2ELL_DIM + 1); - dim3 csr2ell_threads(CSR2ELL_DIM); - - hipLaunchKernelGGL((csr2ell_kernel), - csr2ell_blocks, - csr2ell_threads, - 0, - stream, - m, - csr_val, - csr_row_ptr, - csr_col_ind, - hyb->ell_width, - hyb->ell_col_ind, - (T*)hyb->ell_val, - hyb->coo_row_ind, - hyb->coo_col_ind, - (T*)hyb->coo_val, - workspace2, - descr->base); - - RETURN_IF_HIP_ERROR(hipFree(workspace2)); -#undef CSR2ELL_DIM - - return rocsparse_status_success; -} +#include "rocsparse_csr2hyb.hpp" /* * 
=========================================================================== diff --git a/library/src/conversion/rocsparse_csr2hyb.hpp b/library/src/conversion/rocsparse_csr2hyb.hpp new file mode 100644 index 00000000..87783b7f --- /dev/null +++ b/library/src/conversion/rocsparse_csr2hyb.hpp @@ -0,0 +1,308 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_CSR2HYB_HPP +#define ROCSPARSE_CSR2HYB_HPP + +#include "rocsparse.h" +#include "definitions.h" +#include "handle.h" +#include "utility.h" +#include "csr2hyb_device.h" + +#include + +template +rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_hyb_mat hyb, + rocsparse_int user_ell_width, + rocsparse_hyb_partition partition_type) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(hyb == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + log_trace(handle, + replaceX("rocsparse_Xcsr2hyb"), + m, + n, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)hyb, + user_ell_width, + partition_type); + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + // Check matrix type + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + // Check partition type + if(partition_type != rocsparse_hyb_partition_max && + 
partition_type != rocsparse_hyb_partition_user && + partition_type != rocsparse_hyb_partition_auto) + { + return rocsparse_status_invalid_value; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(n < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0) + { + return rocsparse_status_success; + } + + // Get number of CSR non-zeros + rocsparse_int csr_nnz; + RETURN_IF_HIP_ERROR( + hipMemcpy(&csr_nnz, csr_row_ptr + m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + // Correct by index base + csr_nnz -= descr->base; + + // Check user_ell_width + if(partition_type == rocsparse_hyb_partition_user) + { + // ELL width cannot be 0 or negative + if(user_ell_width < 0) + { + return rocsparse_status_invalid_value; + } + + rocsparse_int max_row_nnz = (2 * csr_nnz - 1) / m + 1; + if(user_ell_width > max_row_nnz) + { + return rocsparse_status_invalid_value; + } + } + + // Stream + hipStream_t stream = handle->stream; + + // Clear HYB structure if already allocated + hyb->m = m; + hyb->n = n; + hyb->partition = partition_type; + hyb->ell_nnz = 0; + hyb->ell_width = 0; + hyb->coo_nnz = 0; + + if(hyb->ell_col_ind) + { + RETURN_IF_HIP_ERROR(hipFree(hyb->ell_col_ind)); + } + if(hyb->ell_val) + { + RETURN_IF_HIP_ERROR(hipFree(hyb->ell_val)); + } + if(hyb->coo_row_ind) + { + RETURN_IF_HIP_ERROR(hipFree(hyb->coo_row_ind)); + } + if(hyb->coo_col_ind) + { + RETURN_IF_HIP_ERROR(hipFree(hyb->coo_col_ind)); + } + if(hyb->coo_val) + { + RETURN_IF_HIP_ERROR(hipFree(hyb->coo_val)); + } + +// Determine ELL width + +#define CSR2ELL_DIM 512 + // Workspace size + rocsparse_int blocks = (m - 1) / CSR2ELL_DIM + 1; + // Allocate 
workspace + rocsparse_int* workspace = NULL; + RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace, sizeof(rocsparse_int) * blocks)); + + if(partition_type == rocsparse_hyb_partition_user) + { + // ELL width given by user + hyb->ell_width = user_ell_width; + } + else if(partition_type == rocsparse_hyb_partition_auto) + { + // ELL width determined by average nnz per row + hyb->ell_width = (csr_nnz - 1) / m + 1; + } + else + { + // HYB == ELL - no COO part - compute maximum nnz per row + hipLaunchKernelGGL((ell_width_kernel_part1), + dim3(blocks), + dim3(CSR2ELL_DIM), + 0, + stream, + m, + csr_row_ptr, + workspace); + + hipLaunchKernelGGL((ell_width_kernel_part2), + dim3(1), + dim3(CSR2ELL_DIM), + 0, + stream, + blocks, + workspace); + // Copy ell width back to host + RETURN_IF_HIP_ERROR( + hipMemcpy(&hyb->ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + } + + // Compute ELL non-zeros + hyb->ell_nnz = hyb->ell_width * m; + + // Allocate ELL part + if(hyb->ell_nnz > 0) + { + RETURN_IF_HIP_ERROR( + hipMalloc((void**)&hyb->ell_col_ind, sizeof(rocsparse_int) * hyb->ell_nnz)); + RETURN_IF_HIP_ERROR(hipMalloc(&hyb->ell_val, sizeof(T) * hyb->ell_nnz)); + } + + // Allocate workspace2 + rocsparse_int* workspace2 = NULL; + RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace2, sizeof(rocsparse_int) * (m + 1))); + + // If there is a COO part, compute the COO non-zero elements per row + if(partition_type != rocsparse_hyb_partition_max) + { + // If there is no ELL part, its easy... 
+ if(hyb->ell_nnz == 0) + { + hyb->coo_nnz = csr_nnz; + RETURN_IF_HIP_ERROR(hipMemcpy( + workspace2, csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToDevice)); + } + else + { + hipLaunchKernelGGL((hyb_coo_nnz_part1), + dim3((m - 1) / CSR2ELL_DIM + 1), + dim3(CSR2ELL_DIM), + 0, + stream, + m, + hyb->ell_width, + csr_row_ptr, + workspace, + workspace2); + + hipLaunchKernelGGL((hyb_coo_nnz_part2), + dim3(1), + dim3(CSR2ELL_DIM), + 0, + stream, + blocks, + workspace); + + RETURN_IF_HIP_ERROR( + hipMemcpy(&hyb->coo_nnz, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + // Perform exclusive scan on workspace TODO use rocPRIM + std::vector hbuf(m + 1); + RETURN_IF_HIP_ERROR(hipMemcpy( + hbuf.data() + 1, workspace2, sizeof(rocsparse_int) * m, hipMemcpyDeviceToHost)); + + hbuf[0] = descr->base; + for(rocsparse_int i = 0; i < m; ++i) + { + hbuf[i + 1] += hbuf[i]; + } + + RETURN_IF_HIP_ERROR(hipMemcpy( + workspace2, hbuf.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + } + } + + RETURN_IF_HIP_ERROR(hipFree(workspace)); + + // Allocate COO part + if(hyb->coo_nnz > 0) + { + RETURN_IF_HIP_ERROR( + hipMalloc((void**)&hyb->coo_row_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); + RETURN_IF_HIP_ERROR( + hipMalloc((void**)&hyb->coo_col_ind, sizeof(rocsparse_int) * hyb->coo_nnz)); + RETURN_IF_HIP_ERROR(hipMalloc(&hyb->coo_val, sizeof(T) * hyb->coo_nnz)); + } + + dim3 csr2ell_blocks((m - 1) / CSR2ELL_DIM + 1); + dim3 csr2ell_threads(CSR2ELL_DIM); + + hipLaunchKernelGGL((csr2ell_kernel), + csr2ell_blocks, + csr2ell_threads, + 0, + stream, + m, + csr_val, + csr_row_ptr, + csr_col_ind, + hyb->ell_width, + hyb->ell_col_ind, + (T*)hyb->ell_val, + hyb->coo_row_ind, + hyb->coo_col_ind, + (T*)hyb->coo_val, + workspace2, + descr->base); + + RETURN_IF_HIP_ERROR(hipFree(workspace2)); +#undef CSR2ELL_DIM + + return rocsparse_status_success; +} + +#endif // ROCSPARSE_CSR2HYB_HPP diff --git a/library/src/level1/axpyi_device.h 
b/library/src/level1/axpyi_device.h new file mode 100644 index 00000000..364f26d7 --- /dev/null +++ b/library/src/level1/axpyi_device.h @@ -0,0 +1,29 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef AXPYI_DEVICE_H +#define AXPYI_DEVICE_H + +#include + +template +__device__ void axpyi_device(rocsparse_int nnz, + T alpha, + const T* x_val, + const rocsparse_int* x_ind, + T* y, + rocsparse_index_base idx_base) +{ + int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if(tid >= nnz) + { + return; + } + + y[x_ind[tid] - idx_base] += alpha * x_val[tid]; +} + +#endif // AXPYI_DEVICE_H diff --git a/library/src/level1/rocsparse_axpyi.cpp b/library/src/level1/rocsparse_axpyi.cpp index 05789e2b..0007a89d 100644 --- a/library/src/level1/rocsparse_axpyi.cpp +++ b/library/src/level1/rocsparse_axpyi.cpp @@ -3,190 +3,7 @@ * ************************************************************************ */ #include "rocsparse.h" -#include "handle.h" -#include "utility.h" - -#include - -template -__device__ void axpyi_device(rocsparse_int nnz, - T alpha, - const T* x_val, - const rocsparse_int* x_ind, - T* y, - rocsparse_index_base idx_base) -{ - int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - if(tid >= nnz) - { - return; - } - - y[x_ind[tid] - idx_base] += alpha * x_val[tid]; -} - -template -__global__ void axpyi_kernel_host_scalar(rocsparse_int nnz, - T alpha, - const T* x_val, - const rocsparse_int* x_ind, - T* y, - rocsparse_index_base idx_base) -{ - axpyi_device(nnz, alpha, x_val, x_ind, y, idx_base); -} - -template -__global__ void axpyi_kernel_device_scalar(rocsparse_int nnz, - const T* alpha, - const T* x_val, - const rocsparse_int* x_ind, - T* y, - rocsparse_index_base idx_base) -{ - axpyi_device(nnz, *alpha, x_val, x_ind, y, idx_base); -} - -/*! 
\brief SPARSE Level 1 API - - \details - axpyi compute y := alpha * x + y - - @param[in] - handle rocsparse_handle. - handle to the rocsparse library context queue. - @param[in] - nnz number of non-zero entries in x - if nnz <= 0 quick return with rocsparse_status_success - @param[in] - alpha scalar alpha. - @param[in] - x_val pointer storing vector x non-zero values on the GPU. - @param[in] - x_ind pointer storing vector x non-zero value indices on the GPU. - @param[inout] - y pointer storing y on the GPU. - @param[in] - idx_base specifies the index base. - - ********************************************************************/ -template -rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, - rocsparse_int nnz, - const T* alpha, - const T* x_val, - const rocsparse_int* x_ind, - T* y, - rocsparse_index_base idx_base) -{ - // Check for valid handle - if(handle == nullptr) - { - return rocsparse_status_invalid_handle; - } - - // Logging // TODO bench logging - if(handle->pointer_mode == rocsparse_pointer_mode_host) - { - log_trace(handle, - replaceX("rocsparse_Xaxpyi"), - nnz, - *alpha, - (const void*&)x_val, - (const void*&)x_ind, - (const void*&)y); - } - else - { - log_trace(handle, - replaceX("rocsparse_Xaxpyi"), - nnz, - (const void*&)alpha, - (const void*&)x_val, - (const void*&)x_ind, - (const void*&)y); - } - - // Check index base - if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) - { - return rocsparse_status_invalid_value; - } - - // Check size - if(nnz < 0) - { - return rocsparse_status_invalid_size; - } - - // Check pointer arguments - if(alpha == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(x_val == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(x_ind == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(y == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Quick return if possible - if(nnz == 0) - { - return 
rocsparse_status_success; - } - - // Stream - hipStream_t stream = handle->stream; - -#define AXPYI_DIM 256 - dim3 axpyi_blocks((nnz - 1) / AXPYI_DIM + 1); - dim3 axpyi_threads(AXPYI_DIM); - - if(handle->pointer_mode == rocsparse_pointer_mode_device) - { - hipLaunchKernelGGL((axpyi_kernel_device_scalar), - axpyi_blocks, - axpyi_threads, - 0, - stream, - nnz, - alpha, - x_val, - x_ind, - y, - idx_base); - } - else - { - if(*alpha == 0.0) - { - return rocsparse_status_success; - } - - hipLaunchKernelGGL((axpyi_kernel_host_scalar), - axpyi_blocks, - axpyi_threads, - 0, - stream, - nnz, - *alpha, - x_val, - x_ind, - y, - idx_base); - } -#undef AXPYI_DIM - return rocsparse_status_success; -} +#include "rocsparse_axpyi.hpp" /* * =========================================================================== diff --git a/library/src/level1/rocsparse_axpyi.hpp b/library/src/level1/rocsparse_axpyi.hpp new file mode 100644 index 00000000..5688111a --- /dev/null +++ b/library/src/level1/rocsparse_axpyi.hpp @@ -0,0 +1,178 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_AXPYI_HPP +#define ROCSPARSE_AXPYI_HPP + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "axpyi_device.h" + +#include + +template +__global__ void axpyi_kernel_host_scalar(rocsparse_int nnz, + T alpha, + const T* x_val, + const rocsparse_int* x_ind, + T* y, + rocsparse_index_base idx_base) +{ + axpyi_device(nnz, alpha, x_val, x_ind, y, idx_base); +} + +template +__global__ void axpyi_kernel_device_scalar(rocsparse_int nnz, + const T* alpha, + const T* x_val, + const rocsparse_int* x_ind, + T* y, + rocsparse_index_base idx_base) +{ + axpyi_device(nnz, *alpha, x_val, x_ind, y, idx_base); +} + +/*! 
\brief SPARSE Level 1 API + + \details + axpyi compute y := alpha * x + y + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + nnz number of non-zero entries in x + if nnz <= 0 quick return with rocsparse_status_success + @param[in] + alpha scalar alpha. + @param[in] + x_val pointer storing vector x non-zero values on the GPU. + @param[in] + x_ind pointer storing vector x non-zero value indices on the GPU. + @param[inout] + y pointer storing y on the GPU. + @param[in] + idx_base specifies the index base. + + ********************************************************************/ +template +rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, + rocsparse_int nnz, + const T* alpha, + const T* x_val, + const rocsparse_int* x_ind, + T* y, + rocsparse_index_base idx_base) +{ + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging // TODO bench logging + if(handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_Xaxpyi"), + nnz, + *alpha, + (const void*&)x_val, + (const void*&)x_ind, + (const void*&)y); + } + else + { + log_trace(handle, + replaceX("rocsparse_Xaxpyi"), + nnz, + (const void*&)alpha, + (const void*&)x_val, + (const void*&)x_ind, + (const void*&)y); + } + + // Check index base + if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + + // Check size + if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(alpha == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(nnz == 0) + { + return 
rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + +#define AXPYI_DIM 256 + dim3 axpyi_blocks((nnz - 1) / AXPYI_DIM + 1); + dim3 axpyi_threads(AXPYI_DIM); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + hipLaunchKernelGGL((axpyi_kernel_device_scalar), + axpyi_blocks, + axpyi_threads, + 0, + stream, + nnz, + alpha, + x_val, + x_ind, + y, + idx_base); + } + else + { + if(*alpha == 0.0) + { + return rocsparse_status_success; + } + + hipLaunchKernelGGL((axpyi_kernel_host_scalar), + axpyi_blocks, + axpyi_threads, + 0, + stream, + nnz, + *alpha, + x_val, + x_ind, + y, + idx_base); + } +#undef AXPYI_DIM + return rocsparse_status_success; +} + +#endif // ROCSPARSE_AXPYI_HPP From 6c37c8dde5819964fd1cbe880427eba5a79739d5 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 16:00:50 +0200 Subject: [PATCH 101/304] csr2ell added with tests --- .../rocsparse_template_specialization.cpp | 30 ++ clients/include/rocsparse.hpp | 12 + clients/include/testing_csr2ell.hpp | 476 ++++++++++++++++++ clients/include/utility.hpp | 14 +- clients/tests/CMakeLists.txt | 1 + clients/tests/test_csr2ell.cpp | 65 +++ library/include/rocsparse-functions.h | 42 ++ library/src/CMakeLists.txt | 4 +- library/src/conversion/csr2ell_device.h | 136 +++++ library/src/conversion/csr2hyb_device.h | 82 +-- library/src/conversion/rocsparse_csr2ell.cpp | 164 ++++++ library/src/conversion/rocsparse_csr2ell.hpp | 136 +++++ library/src/conversion/rocsparse_csr2hyb.hpp | 3 +- 13 files changed, 1077 insertions(+), 88 deletions(-) create mode 100644 clients/include/testing_csr2ell.hpp create mode 100644 clients/tests/test_csr2ell.cpp create mode 100644 library/src/conversion/csr2ell_device.h create mode 100644 library/src/conversion/rocsparse_csr2ell.cpp create mode 100644 library/src/conversion/rocsparse_csr2ell.hpp diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 
1331bdc9..f4bf41aa 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -134,6 +134,36 @@ rocsparse_status rocsparse_hybmv(rocsparse_handle handle, return rocsparse_dhybmv(handle, trans, alpha, descr, hyb, x, beta, y); } +template <> +rocsparse_status rocsparse_csr2ell(rocsparse_handle handle, + rocsparse_int m, + const rocsparse_mat_descr csr_descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + float* ell_val, + rocsparse_int* ell_col_ind) +{ + return rocsparse_scsr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); +} + +template <> +rocsparse_status rocsparse_csr2ell(rocsparse_handle handle, + rocsparse_int m, + const rocsparse_mat_descr csr_descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + double* ell_val, + rocsparse_int* ell_col_ind) +{ + return rocsparse_dcsr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); +} + template <> rocsparse_status rocsparse_csr2hyb(rocsparse_handle handle, rocsparse_int m, diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index 47858b7f..13c9e992 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -58,6 +58,18 @@ rocsparse_status rocsparse_hybmv(rocsparse_handle handle, const T* beta, T* y); +template +rocsparse_status rocsparse_csr2ell(rocsparse_handle handle, + rocsparse_int m, + const rocsparse_mat_descr csr_descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + T* ell_val, + rocsparse_int* ell_col_ind); + template rocsparse_status 
rocsparse_csr2hyb(rocsparse_handle handle, rocsparse_int m, diff --git a/clients/include/testing_csr2ell.hpp b/clients/include/testing_csr2ell.hpp new file mode 100644 index 00000000..3abb1695 --- /dev/null +++ b/clients/include/testing_csr2ell.hpp @@ -0,0 +1,476 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef TESTING_CSR2ELL_HPP +#define TESTING_CSR2ELL_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +#define ELL_IND_ROW(i, el, m, width) (el) * (m) + (i) +#define ELL_IND_EL(i, el, m, width) (el) + (width) * (i) +#define ELL_IND(i, el, m, width) ELL_IND_ROW(i, el, m, width) + +template +void testing_csr2ell_bad_arg(void) +{ + rocsparse_int m = 100; + rocsparse_int safe_size = 100; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_csr_descr(new descr_struct); + rocsparse_mat_descr csr_descr = unique_ptr_csr_descr->descr; + + std::unique_ptr unique_ptr_ell_descr(new descr_struct); + rocsparse_mat_descr ell_descr = unique_ptr_ell_descr->descr; + + auto csr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); + rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); + T* csr_val = (T*)csr_val_managed.get(); + + if(!csr_row_ptr || !csr_col_ind || !csr_val) + { + 
PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // CSR to ELL conversion is a two step process - test both functions for bad arguments + + // Step 1: Determine number of non-zero elements of ELL storage format + rocsparse_int ell_width; + + // Testing for (csr_row_ptr == nullptr) + { + rocsparse_int* csr_row_ptr_null = nullptr; + + status = rocsparse_csr2ell_width(handle, m, csr_descr, csr_row_ptr_null, ell_descr, &ell_width); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); + } + + // Testing for (ell_widht == nullptr) + { + rocsparse_int* ell_width_null = nullptr; + + status = rocsparse_csr2ell_width(handle, m, csr_descr, csr_row_ptr, ell_descr, ell_width_null); + verify_rocsparse_status_invalid_pointer(status, "Error: ell_width is nullptr"); + } + + // Testing for (csr_descr == nullptr) + { + rocsparse_mat_descr csr_descr_null = nullptr; + + status = rocsparse_csr2ell_width(handle, m, csr_descr_null, csr_row_ptr, ell_descr, &ell_width); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_descr is nullptr"); + } + + // Testing for (ell_descr == nullptr) + { + rocsparse_mat_descr ell_descr_null = nullptr; + + status = rocsparse_csr2ell_width(handle, m, csr_descr, csr_row_ptr, ell_descr_null, &ell_width); + verify_rocsparse_status_invalid_pointer(status, "Error: ell_descr is nullptr"); + } + + // Testing for (handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csr2ell_width(handle_null, m, csr_descr, csr_row_ptr, ell_descr, &ell_width); + verify_rocsparse_status_invalid_handle(status); + } + + // Allocate memory for ELL storage format + auto ell_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto ell_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* ell_col_ind = (rocsparse_int*)ell_col_ind_managed.get(); + T* ell_val = (T*)ell_val_managed.get(); + + 
if(!ell_col_ind || !ell_val) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // Step 2: Perform the actual conversion + + // Set ell_width to some valid value, to avoid invalid_size status + ell_width = 10; + + // Testing for (csr_row_ptr == nullptr) + { + rocsparse_int* csr_row_ptr_null = nullptr; + + status = rocsparse_csr2ell(handle, m, csr_descr, csr_val, csr_row_ptr_null, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); + } + + // Testing for (csr_col_ind == nullptr) + { + rocsparse_int* csr_col_ind_null = nullptr; + + status = rocsparse_csr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind_null, ell_descr, ell_width, ell_val, ell_col_ind); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr"); + } + + // Testing for (csr_val == nullptr) + { + T* csr_val_null = nullptr; + + status = rocsparse_csr2ell(handle, m, csr_descr, csr_val_null, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_val is nullptr"); + } + + // Testing for (ell_col_ind == nullptr) + { + rocsparse_int* ell_col_ind_null = nullptr; + + status = rocsparse_csr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind_null); + verify_rocsparse_status_invalid_pointer(status, "Error: ell_col_ind is nullptr"); + } + + // Testing for (ell_val == nullptr) + { + T* ell_val_null = nullptr; + + status = rocsparse_csr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val_null, ell_col_ind); + verify_rocsparse_status_invalid_pointer(status, "Error: ell_val is nullptr"); + } + + // Testing for (csr_descr == nullptr) + { + rocsparse_mat_descr csr_descr_null = nullptr; + + status = rocsparse_csr2ell(handle, m, csr_descr_null, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, 
ell_col_ind); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_descr is nullptr"); + } + + // Testing for (ell_descr == nullptr) + { + rocsparse_mat_descr ell_descr_null = nullptr; + + status = rocsparse_csr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr_null, ell_width, ell_val, ell_col_ind); + verify_rocsparse_status_invalid_pointer(status, "Error: ell_descr is nullptr"); + } + + // Testing for (handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csr2ell(handle_null, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_csr2ell(Arguments argus) +{ + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + rocsparse_int safe_size = 100; + rocsparse_index_base csr_base = argus.idx_base; + rocsparse_index_base ell_base = argus.idx_base2; + rocsparse_status status; + + double scale = 0.02; + if(m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_csr_descr(new descr_struct); + rocsparse_mat_descr csr_descr = unique_ptr_csr_descr->descr; + + // Set CSR matrix index base + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(csr_descr, csr_base)); + + std::unique_ptr unique_ptr_ell_descr(new descr_struct); + rocsparse_mat_descr ell_descr = unique_ptr_ell_descr->descr; + + // Set ELL matrix index base + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(ell_descr, ell_base)); + + // Argument sanity check before allocating invalid memory + if(m <= 0 || n <= 0 || nnz <= 0) + { + auto csr_row_ptr_managed = (m > 0) ? 
rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free} : rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); + rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); + T* csr_val = (T*)csr_val_managed.get(); + + if(!csr_row_ptr || !csr_col_ind || !csr_val) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!csr_row_ptr || !csr_col_ind || !csr_val"); + return rocsparse_status_memory_error; + } + + // To obtain valid input, csr_row_ptr need to be 0 (because either m, n or nnz is 0) + hipMemset(csr_row_ptr, 0, sizeof(rocsparse_int) * ((m > 0) ? (m + 1) : safe_size)); + + // Step 1 + rocsparse_int ell_width; + status = rocsparse_csr2ell_width(handle, m, csr_descr, csr_row_ptr, ell_descr, &ell_width); + + if(m < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0"); + } + + auto ell_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto ell_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* ell_col_ind = (rocsparse_int*)ell_col_ind_managed.get(); + T* ell_val = (T*)ell_val_managed.get(); + + if(!ell_col_ind || !ell_val) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!ell_col_ind || !ell_val"); + return rocsparse_status_memory_error; + } + + // Step 2 + status = rocsparse_csr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, 0, ell_val, ell_col_ind); + + if(m < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0"); + } + else + { + 
verify_rocsparse_status_success(status, "m >= 0"); + } + + return rocsparse_status_success; + } + + // For testing, assemble a COO matrix and convert it to CSR first (on host) + + // Host structures + std::vector hcsr_row_ptr; + std::vector hcoo_row_ind; + std::vector hcsr_col_ind; + std::vector hcsr_val; + + // Sample initial COO matrix on CPU + srand(12345ULL); + if(argus.laplacian) + { + m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcsr_col_ind, hcsr_val, csr_base); + nnz = hcsr_row_ptr[m]; + } + else + { + if(argus.filename != "") + { + if(read_mtx_matrix( + argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, csr_base); + } + + // Convert COO to CSR + hcsr_row_ptr.resize(m + 1, 0); + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - csr_base]; + } + + hcsr_row_ptr[0] = csr_base; + for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + } + } + + // Allocate memory on the device + auto dcsr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + auto dcsr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dcsr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + + rocsparse_int* dcsr_row_ptr = (rocsparse_int*)dcsr_row_ptr_managed.get(); + rocsparse_int* dcsr_col_ind = (rocsparse_int*)dcsr_col_ind_managed.get(); + T* dcsr_val = (T*)dcsr_val_managed.get(); + + if(!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val"); + return rocsparse_status_memory_error; + } + + // Copy data from host to device + CHECK_HIP_ERROR(hipMemcpy( + 
dcsr_row_ptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy( + dcsr_col_ind, hcsr_col_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcsr_val, hcsr_val.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + + // Host csr2ell conversion + rocsparse_int ell_width_gold = 0; + + // Determine max nnz per row + for(rocsparse_int i = 0; i < m; ++i) + { + rocsparse_int row_nnz = hcsr_row_ptr[i + 1] - hcsr_row_ptr[i]; + ell_width_gold = (row_nnz > ell_width_gold) ? row_nnz : ell_width_gold; + } + + rocsparse_int ell_nnz_gold = ell_width_gold * m; + + // Allocate host memory + std::vector hell_col_ind_gold(ell_nnz_gold); + std::vector hell_val_gold(ell_nnz_gold); + + // Fill ELL structures + for(rocsparse_int i = 0; i < m; ++i) + { + rocsparse_int p = 0; + for(rocsparse_int j = hcsr_row_ptr[i] - csr_base; j < hcsr_row_ptr[i + 1] - csr_base; ++j) + { + if(p >= ell_width_gold) + { + break; + } + + rocsparse_int idx = ELL_IND(i, p++, m, ell_width_gold); + hell_col_ind_gold[idx] = hcsr_col_ind[j] - csr_base + ell_base; + hell_val_gold[idx] = hcsr_val[j]; + } + for(rocsparse_int j = hcsr_row_ptr[i + 1] - hcsr_row_ptr[i]; j < ell_width_gold; ++j) + { + rocsparse_int idx = ELL_IND(i, p++, m, ell_width_gold); + hell_col_ind_gold[idx] = -1; + hell_val_gold[idx] = static_cast(0); + } + } + + // Allocate verification structures + std::vector hell_col_ind(ell_nnz_gold); + std::vector hell_val(ell_nnz_gold); + rocsparse_int ell_width; + + if(argus.unit_check) + { + CHECK_ROCSPARSE_ERROR(rocsparse_csr2ell_width(handle, m, csr_descr, dcsr_row_ptr, ell_descr, &ell_width)); + + rocsparse_int ell_nnz = ell_width * m; + + // Check if ELL width does match + unit_check_general(1, 1, &ell_width_gold, &ell_width); + unit_check_general(1, 1, &ell_nnz_gold, &ell_nnz); + + // Allocate ELL device memory + auto dell_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * 
ell_nnz), device_free}; + auto dell_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * ell_nnz), device_free}; + + rocsparse_int* dell_col_ind = (rocsparse_int*)dell_col_ind_managed.get(); + T* dell_val = (T*)dell_val_managed.get(); + + // Perform actual ELL conversion + CHECK_ROCSPARSE_ERROR(rocsparse_csr2ell(handle, m, csr_descr, dcsr_val, dcsr_row_ptr, dcsr_col_ind, ell_descr, ell_width, dell_val, dell_col_ind)); + + CHECK_HIP_ERROR(hipMemcpy(hell_col_ind.data(), + dell_col_ind, + sizeof(rocsparse_int) * ell_nnz, + hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy( + hell_val.data(), dell_val, sizeof(T) * ell_nnz, hipMemcpyDeviceToHost)); + + // Unit check + unit_check_general(1, ell_nnz, hell_col_ind_gold.data(), hell_col_ind.data()); + unit_check_general(1, ell_nnz, hell_val_gold.data(), hell_val.data()); + } + + if(argus.timing) + { + rocsparse_int number_cold_calls = 2; + rocsparse_int number_hot_calls = argus.iters; + + for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) + { + rocsparse_csr2ell_width(handle, m, csr_descr, dcsr_row_ptr, ell_descr, &ell_width); + rocsparse_int ell_nnz = ell_width * m; + + auto dell_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * ell_nnz), device_free}; + auto dell_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * ell_nnz), device_free}; + + rocsparse_int* dell_col_ind = (rocsparse_int*)dell_col_ind_managed.get(); + T* dell_val = (T*)dell_val_managed.get(); + + rocsparse_csr2ell(handle, m, csr_descr, dcsr_val, dcsr_row_ptr, dcsr_col_ind, ell_descr, ell_width, dell_val, dell_col_ind); + } + + double gpu_time_used = get_time_us(); + + for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) + { + rocsparse_csr2ell_width(handle, m, csr_descr, dcsr_row_ptr, ell_descr, &ell_width); + rocsparse_int ell_nnz = ell_width * m; + + auto dell_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * ell_nnz), device_free}; + auto dell_val_managed = 
rocsparse_unique_ptr{device_malloc(sizeof(T) * ell_nnz), device_free}; + + rocsparse_int* dell_col_ind = (rocsparse_int*)dell_col_ind_managed.get(); + T* dell_val = (T*)dell_val_managed.get(); + + rocsparse_csr2ell(handle, m, csr_descr, dcsr_val, dcsr_row_ptr, dcsr_col_ind, ell_descr, ell_width, dell_val, dell_col_ind); + } + + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + + printf("m\t\tn\t\tnnz\t\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\n", m, n, nnz, gpu_time_used); + } + + return rocsparse_status_success; +} + +#endif // TESTING_CSR2ELL_HPP diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index 30e2f437..31bb4254 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -600,9 +600,10 @@ class Arguments double alpha = 1.0; double beta = 0.0; - rocsparse_operation trans = rocsparse_operation_none; - rocsparse_index_base idx_base = rocsparse_index_base_zero; - rocsparse_hyb_partition part = rocsparse_hyb_partition_auto; + rocsparse_operation trans = rocsparse_operation_none; + rocsparse_index_base idx_base = rocsparse_index_base_zero; + rocsparse_index_base idx_base2 = rocsparse_index_base_zero; + rocsparse_hyb_partition part = rocsparse_hyb_partition_auto; rocsparse_int norm_check = 0; rocsparse_int unit_check = 1; @@ -623,9 +624,10 @@ class Arguments alpha = rhs.alpha; beta = rhs.beta; - trans = rhs.trans; - idx_base = rhs.idx_base; - part = rhs.part; + trans = rhs.trans; + idx_base = rhs.idx_base; + idx_base2 = rhs.idx_base2; + part = rhs.part; norm_check = rhs.norm_check; unit_check = rhs.unit_check; diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index 74cb50b2..ba0bd62c 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -11,6 +11,7 @@ set(ROCSPARSE_TEST_SOURCES test_csrmv.cpp test_hybmv.cpp test_csr2coo.cpp + test_csr2ell.cpp test_csr2hyb.cpp test_coo2csr.cpp ) diff --git a/clients/tests/test_csr2ell.cpp 
b/clients/tests/test_csr2ell.cpp new file mode 100644 index 00000000..4ecc8231 --- /dev/null +++ b/clients/tests/test_csr2ell.cpp @@ -0,0 +1,65 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "testing_csr2ell.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef std::tuple csr2ell_tuple; + +int csr2ell_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int csr2ell_N_range[] = {-3, 0, 33, 242, 623, 1000}; + +rocsparse_index_base csr2ell_csr_base_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; +rocsparse_index_base csr2ell_ell_base_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; + +class parameterized_csr2ell : public testing::TestWithParam +{ + protected: + parameterized_csr2ell() {} + virtual ~parameterized_csr2ell() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_csr2ell_arguments(csr2ell_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.idx_base2 = std::get<3>(tup); + arg.timing = 0; + return arg; +} + +TEST(csr2ell_bad_arg, csr2ell) { testing_csr2ell_bad_arg(); } + +TEST_P(parameterized_csr2ell, csr2ell_float) +{ + Arguments arg = setup_csr2ell_arguments(GetParam()); + + rocsparse_status status = testing_csr2ell(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csr2ell, csr2ell_double) +{ + Arguments arg = setup_csr2ell_arguments(GetParam()); + + rocsparse_status status = testing_csr2ell(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(csr2ell, + parameterized_csr2ell, + testing::Combine(testing::ValuesIn(csr2ell_M_range), + testing::ValuesIn(csr2ell_N_range), + testing::ValuesIn(csr2ell_csr_base_range), + testing::ValuesIn(csr2ell_ell_base_range))); diff --git 
a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index bc370baf..0ca10bb2 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -410,6 +410,48 @@ rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, rocsparse_int* coo_row_ind, rocsparse_index_base idx_base); +/*! \brief SPARSE Format Conversions API + + \details + csr2ell converts a CSR matrix into an ELL matrix. + + // TODO + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csr2ell_width(rocsparse_handle handle, + rocsparse_int m, + const rocsparse_mat_descr csr_descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_mat_descr ell_descr, + rocsparse_int* ell_width); + +// TODO descr. text + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsr2ell(rocsparse_handle handle, + rocsparse_int m, + const rocsparse_mat_descr csr_descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + float* ell_val, + rocsparse_int* ell_col_ind); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsr2ell(rocsparse_handle handle, + rocsparse_int m, + const rocsparse_mat_descr csr_descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + double* ell_val, + rocsparse_int* ell_col_ind); + /*! 
\brief SPARSE Format Conversions API \details diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index cf00dcca..df839cf5 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -10,8 +10,10 @@ set(rocsparse_source src/level1/rocsparse_axpyi.cpp src/level2/rocsparse_coomv.cpp src/level2/rocsparse_csrmv.cpp + src/level2/rocsparse_ellmv.cpp src/level2/rocsparse_hybmv.cpp - src/conversion/rocsparse_csr2hyb.cpp src/conversion/rocsparse_csr2coo.cpp + src/conversion/rocsparse_csr2ell.cpp + src/conversion/rocsparse_csr2hyb.cpp src/conversion/rocsparse_coo2csr.cpp ) diff --git a/library/src/conversion/csr2ell_device.h b/library/src/conversion/csr2ell_device.h new file mode 100644 index 00000000..7c709fff --- /dev/null +++ b/library/src/conversion/csr2ell_device.h @@ -0,0 +1,136 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef CSR2ELL_DEVICE_H +#define CSR2ELL_DEVICE_H + +//#include "handle.h" + +#include + +template +__device__ void ell_width_reduce(rocsparse_int tid, rocsparse_int* data) +{ + __syncthreads(); + + for(int i = NB >> 1; i > 0; i >>= 1) + { + if(tid < i) + { + data[tid] = max(data[tid], data[tid + i]); + } + + __syncthreads(); + } +} + +template +__global__ void +ell_width_kernel_part1(rocsparse_int m, const rocsparse_int* csr_row_ptr, rocsparse_int* workspace) +{ + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + __shared__ rocsparse_int sdata[NB]; + + if(gid < m) + { + sdata[tid] = csr_row_ptr[gid + 1] - csr_row_ptr[gid]; + } + else + { + sdata[tid] = 0; + } + + ell_width_reduce(tid, sdata); + + if(tid == 0) + { + workspace[hipBlockIdx_x] = sdata[0]; + } +} + +template +__global__ void ell_width_kernel_part2(rocsparse_int m, rocsparse_int* workspace) +{ + rocsparse_int tid = 
hipThreadIdx_x; + + __shared__ rocsparse_int sdata[NB]; + sdata[tid] = 0; + + for(rocsparse_int i = tid; i < m; i += NB) + { + sdata[tid] = (workspace[i] > sdata[tid]) ? workspace[i] : sdata[tid]; + } + + __syncthreads(); + + if(m < 32) + { + if(tid == 0) + { + for(rocsparse_int i = 1; i < m; ++i) + { + sdata[0] = (sdata[i] > sdata[0]) ? sdata[i] : sdata[0]; + } + } + } + else + { + ell_width_reduce(tid, sdata); + } + + if(tid == 0) + { + workspace[0] = sdata[0]; + } +} + +template +__global__ void csr2ell_kernel(rocsparse_int m, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_index_base csr_idx_base, + rocsparse_int ell_width, + rocsparse_int* ell_col_ind, + T* ell_val, + rocsparse_index_base ell_idx_base) +{ + rocsparse_int ai = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if(ai >= m) + { + return; + } + + rocsparse_int p = 0; + + rocsparse_int row_begin = csr_row_ptr[ai] - csr_idx_base; + rocsparse_int row_end = csr_row_ptr[ai + 1] - csr_idx_base; + + // Fill HYB matrix + for(rocsparse_int aj = row_begin; aj < row_end; ++aj) + { + if(p >= ell_width) + { + break; + } + + rocsparse_int idx = ELL_IND(ai, p++, m, ell_width); + ell_col_ind[idx] = csr_col_ind[aj] - csr_idx_base + ell_idx_base; + ell_val[idx] = csr_val[aj]; + } + + // Pad remaining ELL structure + for(rocsparse_int aj = row_end - row_begin; aj < ell_width; ++aj) + { + rocsparse_int idx = ELL_IND(ai, p++, m, ell_width); + ell_col_ind[idx] = -1; + ell_val[idx] = static_cast(0); + } +} + +#endif // CSR2ELL_DEVICE_H diff --git a/library/src/conversion/csr2hyb_device.h b/library/src/conversion/csr2hyb_device.h index e0a05d10..3c4ba569 100644 --- a/library/src/conversion/csr2hyb_device.h +++ b/library/src/conversion/csr2hyb_device.h @@ -103,86 +103,8 @@ __global__ void hyb_coo_nnz_part2(rocsparse_int m, rocsparse_int* workspace) } } -template -__device__ void ell_width_reduce(rocsparse_int tid, rocsparse_int* data) -{ - __syncthreads(); - - 
for(int i = NB >> 1; i > 0; i >>= 1) - { - if(tid < i) - { - data[tid] = max(data[tid], data[tid + i]); - } - - __syncthreads(); - } -} - -template -__global__ void -ell_width_kernel_part1(rocsparse_int m, const rocsparse_int* csr_row_ptr, rocsparse_int* workspace) -{ - rocsparse_int tid = hipThreadIdx_x; - rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - __shared__ rocsparse_int sdata[NB]; - - if(gid < m) - { - sdata[tid] = csr_row_ptr[gid + 1] - csr_row_ptr[gid]; - } - else - { - sdata[tid] = 0; - } - - ell_width_reduce(tid, sdata); - - if(tid == 0) - { - workspace[hipBlockIdx_x] = sdata[0]; - } -} - -template -__global__ void ell_width_kernel_part2(rocsparse_int m, rocsparse_int* workspace) -{ - rocsparse_int tid = hipThreadIdx_x; - - __shared__ rocsparse_int sdata[NB]; - sdata[tid] = 0; - - for(rocsparse_int i = tid; i < m; i += NB) - { - sdata[tid] = (workspace[i] > sdata[tid]) ? workspace[i] : sdata[tid]; - } - - __syncthreads(); - - if(m < 32) - { - if(tid == 0) - { - for(rocsparse_int i = 1; i < m; ++i) - { - sdata[0] = (sdata[i] > sdata[0]) ? 
sdata[i] : sdata[0]; - } - } - } - else - { - ell_width_reduce(tid, sdata); - } - - if(tid == 0) - { - workspace[0] = sdata[0]; - } -} - template -__global__ void csr2ell_kernel(rocsparse_int m, +__global__ void csr2hyb_kernel(rocsparse_int m, const T* csr_val, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, @@ -193,7 +115,7 @@ __global__ void csr2ell_kernel(rocsparse_int m, rocsparse_int* coo_col_ind, T* coo_val, rocsparse_int* workspace, - rocsparse_int idx_base) + rocsparse_index_base idx_base) { rocsparse_int ai = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; diff --git a/library/src/conversion/rocsparse_csr2ell.cpp b/library/src/conversion/rocsparse_csr2ell.cpp new file mode 100644 index 00000000..1263e49c --- /dev/null +++ b/library/src/conversion/rocsparse_csr2ell.cpp @@ -0,0 +1,164 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "rocsparse.h" +#include "rocsparse_csr2ell.hpp" + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_csr2ell_width(rocsparse_handle handle, + rocsparse_int m, + const rocsparse_mat_descr csr_descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_mat_descr ell_descr, + rocsparse_int* ell_width) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(csr_descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(ell_descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + log_trace(handle, + "rocsparse_csr2ell_width", + m, + (const void*&)csr_descr, + (const void*&)csr_row_ptr, + (const void*&)ell_descr, + (const 
void*&)ell_width); + + // Check index base + if(csr_descr->base != rocsparse_index_base_zero && csr_descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(ell_descr->base != rocsparse_index_base_zero && ell_descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + // Check matrix type + if(csr_descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + if(ell_descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(ell_width == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0) + { + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + RETURN_IF_HIP_ERROR(hipMemset(ell_width, 0, sizeof(rocsparse_int))); + } + else + { + *ell_width = 0; + } + return rocsparse_status_success; + } + + hipStream_t stream = handle->stream; + + // Determine ELL width + +#define CSR2ELL_DIM 512 + // Workspace size + rocsparse_int blocks = (m - 1) / CSR2ELL_DIM + 1; + + // Allocate workspace + rocsparse_int* workspace = NULL; + RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace, sizeof(rocsparse_int) * blocks)); + + // Compute maximum nnz per row + hipLaunchKernelGGL((ell_width_kernel_part1), + dim3(blocks), + dim3(CSR2ELL_DIM), + 0, + stream, + m, + csr_row_ptr, + workspace); + + hipLaunchKernelGGL((ell_width_kernel_part2), + dim3(1), + dim3(CSR2ELL_DIM), + 0, + stream, + blocks, + workspace); + + // Copy ELL width back to host, if handle says so + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + RETURN_IF_HIP_ERROR(hipMemcpy(ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToDevice)); + } + else + { + 
RETURN_IF_HIP_ERROR(hipMemcpy(ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + } + + return rocsparse_status_success; +} + +extern "C" rocsparse_status rocsparse_scsr2ell(rocsparse_handle handle, + rocsparse_int m, + const rocsparse_mat_descr csr_descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + float* ell_val, + rocsparse_int* ell_col_ind) +{ + return rocsparse_csr2ell_template(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); +} + +extern "C" rocsparse_status rocsparse_dcsr2ell(rocsparse_handle handle, + rocsparse_int m, + const rocsparse_mat_descr csr_descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + double* ell_val, + rocsparse_int* ell_col_ind) +{ + return rocsparse_csr2ell_template(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); +} diff --git a/library/src/conversion/rocsparse_csr2ell.hpp b/library/src/conversion/rocsparse_csr2ell.hpp new file mode 100644 index 00000000..80ce56fc --- /dev/null +++ b/library/src/conversion/rocsparse_csr2ell.hpp @@ -0,0 +1,136 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_CSR2ELL_HPP +#define ROCSPARSE_CSR2ELL_HPP + +#include "rocsparse.h" +#include "definitions.h" +#include "handle.h" +#include "utility.h" +#include "csr2ell_device.h" + +#include + +template +rocsparse_status rocsparse_csr2ell_template(rocsparse_handle handle, + rocsparse_int m, + const rocsparse_mat_descr csr_descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + T* ell_val, + rocsparse_int* ell_col_ind) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(csr_descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(ell_descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + log_trace(handle, + replaceX("rocsparse_Xcsr2ell"), + m, + (const void*&)csr_descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)ell_descr, + ell_width, + (const void*&)ell_val, + (const void*&)ell_col_ind); + + // Check index base + if(csr_descr->base != rocsparse_index_base_zero && csr_descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(ell_descr->base != rocsparse_index_base_zero && ell_descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + // Check matrix type + if(csr_descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + if(ell_descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0 || ell_width < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_val == nullptr) + { + return rocsparse_status_invalid_pointer; 
+ } + else if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(ell_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(ell_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || ell_width == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + +#define CSR2ELL_DIM 512 + dim3 csr2ell_blocks((m - 1) / CSR2ELL_DIM + 1); + dim3 csr2ell_threads(CSR2ELL_DIM); + + hipLaunchKernelGGL((csr2ell_kernel), + csr2ell_blocks, + csr2ell_threads, + 0, + stream, + m, + csr_val, + csr_row_ptr, + csr_col_ind, + csr_descr->base, + ell_width, + ell_col_ind, + ell_val, + ell_descr->base); +#undef CSR2ELL_DIM + return rocsparse_status_success; +} + +#endif // ROCSPARSE_CSR2ELL_HPP diff --git a/library/src/conversion/rocsparse_csr2hyb.hpp b/library/src/conversion/rocsparse_csr2hyb.hpp index 87783b7f..2c33d088 100644 --- a/library/src/conversion/rocsparse_csr2hyb.hpp +++ b/library/src/conversion/rocsparse_csr2hyb.hpp @@ -11,6 +11,7 @@ #include "handle.h" #include "utility.h" #include "csr2hyb_device.h" +#include "csr2ell_device.h" #include @@ -281,7 +282,7 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, dim3 csr2ell_blocks((m - 1) / CSR2ELL_DIM + 1); dim3 csr2ell_threads(CSR2ELL_DIM); - hipLaunchKernelGGL((csr2ell_kernel), + hipLaunchKernelGGL((csr2hyb_kernel), csr2ell_blocks, csr2ell_threads, 0, From 5eac868ef10e629bcf95f81800a0b7d9bfaf4d5d Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 16:01:26 +0200 Subject: [PATCH 102/304] ellmv: added empty files --- library/src/level2/rocsparse_ellmv.cpp | 46 ++++++++++++++++ library/src/level2/rocsparse_ellmv.hpp | 72 ++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 library/src/level2/rocsparse_ellmv.cpp create mode 100644 
library/src/level2/rocsparse_ellmv.hpp diff --git a/library/src/level2/rocsparse_ellmv.cpp b/library/src/level2/rocsparse_ellmv.cpp new file mode 100644 index 00000000..865d385f --- /dev/null +++ b/library/src/level2/rocsparse_ellmv.cpp @@ -0,0 +1,46 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "rocsparse.h" +#include "rocsparse_ellmv.hpp" + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_sellmv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const float* alpha, + const rocsparse_mat_descr descr, + const float* ell_val, + const rocsparse_int* ell_col_ind, + const float* x, + const float* beta, + float* y) +{ + return rocsparse_ellmv_template( + handle, trans, m, n, nnz, alpha, descr, ell_val, ell_col_ind, x, beta, y); +} + +extern "C" rocsparse_status rocsparse_dellmv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const double* alpha, + const rocsparse_mat_descr descr, + const double* ell_val, + const rocsparse_int* ell_col_ind, + const double* x, + const double* beta, + double* y) +{ + return rocsparse_ellmv_template( + handle, trans, m, n, nnz, alpha, descr, ell_val, ell_col_ind, x, beta, y); +} diff --git a/library/src/level2/rocsparse_ellmv.hpp b/library/src/level2/rocsparse_ellmv.hpp new file mode 100644 index 00000000..6e5ab3b5 --- /dev/null +++ b/library/src/level2/rocsparse_ellmv.hpp @@ -0,0 +1,72 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_ELLMV_HPP +#define ROCSPARSE_ELLMV_HPP + +#include "rocsparse.h" +#include "definitions.h" +#include "handle.h" +#include "utility.h" +#include "ellmv_device.h" + +#include + +/*! \brief SPARSE Level 2 API + + \details + ellmv multiplies the dense vector x[i] with scalar alpha and sparse m x n + matrix A that is defined in ELL storage format and add the result to y[i] + that is multiplied by beta, for i = 1 , … , n + + y := alpha * op(A) * x + beta * y, + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + trans operation type of A. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero entries of A. + @param[in] + alpha scalar alpha. + @param[in] + descr descriptor of A. + @param[in] + ell_val array of nnz elements of A. + @param[in] + ell_col_ind array of nnz elements containing the column indices of A. + @param[in] + x array of n elements (op(A) = A) or m elements (op(A) = A^T or + op(A) = A^H). + @param[in] + beta scalar beta. + @param[inout] + y array of m elements (op(A) = A) or n elements (op(A) = A^T or + op(A) = A^H). 
+ + ********************************************************************/ +template +rocsparse_status rocsparse_ellmv_template(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const T* alpha, + const rocsparse_mat_descr descr, + const T* ell_val, + const rocsparse_int* ell_col_ind, + const T* x, + const T* beta, + T* y) +{ + return rocsparse_status_not_implemented; +} + +#endif // ROCSPARSE_ELLMV_HPP From e927a55ddb679f558a5a6f8f95d81199438e06ae Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 28 May 2018 21:31:05 +0200 Subject: [PATCH 103/304] coomv: bugfix in shared memory usage --- library/src/level2/coomv_device.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/library/src/level2/coomv_device.h b/library/src/level2/coomv_device.h index b2457c3e..fb3c9039 100644 --- a/library/src/level2/coomv_device.h +++ b/library/src/level2/coomv_device.h @@ -55,6 +55,16 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, // Global COO array index start for current warp rocsparse_int offset = warpid * loops * WARPSIZE; + // Shared memory to hold row indices and values for segmented reduction + __shared__ rocsparse_int shared_row[BLOCKSIZE]; + __shared__ T shared_val[BLOCKSIZE]; + + // Initialize shared memory + shared_row[tid] = -1; + shared_val[tid] = static_cast(0); + + __syncthreads(); + // Quick return when thread is out of bounds if(offset + laneid >= nnz) { @@ -64,10 +74,6 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, rocsparse_int row; T val; - // Shared memory to hold row indices and values for segmented reduction - __shared__ rocsparse_int shared_row[BLOCKSIZE]; - __shared__ T shared_val[BLOCKSIZE]; - // Current threads index into COO structure rocsparse_int idx = offset + laneid; From 940bb6e0f34bc3b200000efe61a894fd31162f52 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 30 May 2018 08:12:52 +0200 Subject: [PATCH 
104/304] ellmv: performance improvement --- library/src/level2/ellmv_device.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/library/src/level2/ellmv_device.h b/library/src/level2/ellmv_device.h index a96affbe..e45e095e 100644 --- a/library/src/level2/ellmv_device.h +++ b/library/src/level2/ellmv_device.h @@ -35,6 +35,10 @@ static __device__ void ellmvn_device(rocsparse_int m, { sum += ell_val[idx] * x[col]; } + else + { + break; + } } if(beta != static_cast(0)) From 4df16609048bfed7caef9b321d199328e71cd22c Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 30 May 2018 10:08:57 +0200 Subject: [PATCH 105/304] ellmv: implemented --- library/include/rocsparse-functions.h | 66 ++++++++ library/src/level2/rocsparse_ellmv.cpp | 8 +- library/src/level2/rocsparse_ellmv.hpp | 203 ++++++++++++++++++++++++- library/src/level2/rocsparse_hybmv.hpp | 115 +++----------- 4 files changed, 294 insertions(+), 98 deletions(-) diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 0ca10bb2..00b4a48a 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -298,6 +298,72 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, rocsparse_double_complex* y); */ +/*! \brief SPARSE Level 2 API + + \details + ellmv multiplies the dense vector x[i] with scalar alpha and sparse m x n + matrix A that is defined in ELL storage format and add the result to y[i] + that is multiplied by beta, for i = 1 , … , n + + y := alpha * op(A) * x + beta * y, + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + trans operation type of A. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + alpha scalar alpha. + @param[in] + descr descriptor of A. + @param[in] + ell_val array of nnz elements of A. + @param[in] + ell_col_ind array of nnz elements containing the column indices of A. 
+ @param[in] + ell_width ELL width that was pre-computed during format conversion. + @param[in] + x array of n elements (op(A) = A) or m elements (op(A) = A^T or + op(A) = A^H). + @param[in] + beta scalar beta. + @param[inout] + y array of m elements (op(A) = A) or n elements (op(A) = A^T or + op(A) = A^H). + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_sellmv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + const float* alpha, + const rocsparse_mat_descr descr, + const float* ell_val, + const rocsparse_int* ell_col_ind, + rocsparse_int ell_width, + const float* x, + const float* beta, + float* y); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dellmv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + const double* alpha, + const rocsparse_mat_descr descr, + const double* ell_val, + const rocsparse_int* ell_col_ind, + rocsparse_int ell_width, + const double* x, + const double* beta, + double* y); + /*! 
\brief SPARSE Level 2 API \details diff --git a/library/src/level2/rocsparse_ellmv.cpp b/library/src/level2/rocsparse_ellmv.cpp index 865d385f..703104f5 100644 --- a/library/src/level2/rocsparse_ellmv.cpp +++ b/library/src/level2/rocsparse_ellmv.cpp @@ -15,32 +15,32 @@ extern "C" rocsparse_status rocsparse_sellmv(rocsparse_handle handle, rocsparse_operation trans, rocsparse_int m, rocsparse_int n, - rocsparse_int nnz, const float* alpha, const rocsparse_mat_descr descr, const float* ell_val, const rocsparse_int* ell_col_ind, + rocsparse_int ell_width, const float* x, const float* beta, float* y) { return rocsparse_ellmv_template( - handle, trans, m, n, nnz, alpha, descr, ell_val, ell_col_ind, x, beta, y); + handle, trans, m, n, alpha, descr, ell_val, ell_col_ind, ell_width, x, beta, y); } extern "C" rocsparse_status rocsparse_dellmv(rocsparse_handle handle, rocsparse_operation trans, rocsparse_int m, rocsparse_int n, - rocsparse_int nnz, const double* alpha, const rocsparse_mat_descr descr, const double* ell_val, const rocsparse_int* ell_col_ind, + rocsparse_int ell_width, const double* x, const double* beta, double* y) { return rocsparse_ellmv_template( - handle, trans, m, n, nnz, alpha, descr, ell_val, ell_col_ind, x, beta, y); + handle, trans, m, n, alpha, descr, ell_val, ell_col_ind, ell_width, x, beta, y); } diff --git a/library/src/level2/rocsparse_ellmv.hpp b/library/src/level2/rocsparse_ellmv.hpp index 6e5ab3b5..7e596c28 100644 --- a/library/src/level2/rocsparse_ellmv.hpp +++ b/library/src/level2/rocsparse_ellmv.hpp @@ -14,6 +14,36 @@ #include +template +__global__ void ellmvn_kernel_host_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int ell_width, + T alpha, + const rocsparse_int* ell_col_ind, + const T* ell_val, + const T* x, + T beta, + T* y, + rocsparse_index_base idx_base) +{ + ellmvn_device(m, n, ell_width, alpha, ell_col_ind, ell_val, x, beta, y, idx_base); +} + +template +__global__ void ellmvn_kernel_device_pointer(rocsparse_int m, + 
rocsparse_int n, + rocsparse_int ell_width, + const T* alpha, + const rocsparse_int* ell_col_ind, + const T* ell_val, + const T* x, + const T* beta, + T* y, + rocsparse_index_base idx_base) +{ + ellmvn_device(m, n, ell_width, *alpha, ell_col_ind, ell_val, x, *beta, y, idx_base); +} + /*! \brief SPARSE Level 2 API \details @@ -33,8 +63,6 @@ @param[in] n number of columns of A. @param[in] - nnz number of non-zero entries of A. - @param[in] alpha scalar alpha. @param[in] descr descriptor of A. @@ -43,6 +71,8 @@ @param[in] ell_col_ind array of nnz elements containing the column indices of A. @param[in] + ell_width ELL width that was pre-computed during format conversion. + @param[in] x array of n elements (op(A) = A) or m elements (op(A) = A^T or op(A) = A^H). @param[in] @@ -57,16 +87,181 @@ rocsparse_status rocsparse_ellmv_template(rocsparse_handle handle, rocsparse_operation trans, rocsparse_int m, rocsparse_int n, - rocsparse_int nnz, const T* alpha, const rocsparse_mat_descr descr, const T* ell_val, const rocsparse_int* ell_col_ind, + rocsparse_int ell_width, const T* x, const T* beta, T* y) { - return rocsparse_status_not_implemented; + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + if(handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_Xellmv"), + trans, + m, + n, + *alpha, + (const void*&)descr, + (const void*&)ell_val, + (const void*&)ell_col_ind, + ell_width, + (const void*&)x, + *beta, + (const void*&)y); + } + else + { + log_trace(handle, + replaceX("rocsparse_Xellmv"), + trans, + m, + n, + (const void*&)alpha, + (const void*&)descr, + (const void*&)ell_val, + (const void*&)ell_col_ind, + ell_width, + (const void*&)x, + (const void*&)beta, + (const void*&)y); + } + + // Check index base + if(descr->base != 
rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(n < 0) + { + return rocsparse_status_invalid_size; + } + else if(ell_width < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(ell_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(ell_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(alpha == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(beta == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Sanity check + if((m == 0 || n == 0) && ell_width != 0) + { + return rocsparse_status_invalid_size; + } + + // Quick return if possible + if(m == 0 || n == 0 || ell_width == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + // Run different ellmv kernels + if(trans == rocsparse_operation_none) + { +#define ELLMVN_DIM 512 + dim3 ellmvn_blocks((m - 1) / ELLMVN_DIM + 1); + dim3 ellmvn_threads(ELLMVN_DIM); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + hipLaunchKernelGGL((ellmvn_kernel_device_pointer), + ellmvn_blocks, + ellmvn_threads, + 0, + stream, + m, + n, + ell_width, + alpha, + ell_col_ind, + ell_val, + x, + beta, + y, + descr->base); + } + else + { + if(*alpha == 0.0 && *beta == 1.0) + { + return rocsparse_status_success; + } + + hipLaunchKernelGGL((ellmvn_kernel_host_pointer), + ellmvn_blocks, + ellmvn_threads, + 0, + stream, + m, + n, + ell_width, + *alpha, + ell_col_ind, + ell_val, + x, + *beta, + y, + descr->base); + } +#undef ELLMVN_DIM + } + else 
+ { + // TODO + return rocsparse_status_not_implemented; + } + return rocsparse_status_success; } #endif // ROCSPARSE_ELLMV_HPP diff --git a/library/src/level2/rocsparse_hybmv.hpp b/library/src/level2/rocsparse_hybmv.hpp index 1a5ef94c..92086ea4 100644 --- a/library/src/level2/rocsparse_hybmv.hpp +++ b/library/src/level2/rocsparse_hybmv.hpp @@ -10,41 +10,11 @@ #include "definitions.h" #include "handle.h" #include "utility.h" -#include "ellmv_device.h" #include "rocsparse_coomv.hpp" +#include "rocsparse_ellmv.hpp" #include -template -__global__ void ellmvn_kernel_host_pointer(rocsparse_int m, - rocsparse_int n, - rocsparse_int ell_width, - T alpha, - const rocsparse_int* ell_col_ind, - const T* ell_val, - const T* x, - T beta, - T* y, - rocsparse_index_base idx_base) -{ - ellmvn_device(m, n, ell_width, alpha, ell_col_ind, ell_val, x, beta, y, idx_base); -} - -template -__global__ void ellmvn_kernel_device_pointer(rocsparse_int m, - rocsparse_int n, - rocsparse_int ell_width, - const T* alpha, - const rocsparse_int* ell_col_ind, - const T* ell_val, - const T* x, - const T* beta, - T* y, - rocsparse_index_base idx_base) -{ - ellmvn_device(m, n, ell_width, *alpha, ell_col_ind, ell_val, x, *beta, y, idx_base); -} - template rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, rocsparse_operation trans, @@ -186,40 +156,30 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, return rocsparse_status_success; } - // Stream - hipStream_t stream = handle->stream; - // Run different hybmv kernels if(trans == rocsparse_operation_none) { -#define ELLMVN_DIM 512 - dim3 ellmvn_blocks((hyb->m - 1) / ELLMVN_DIM + 1); - dim3 ellmvn_threads(ELLMVN_DIM); - - if(handle->pointer_mode == rocsparse_pointer_mode_device) + // ELL part + if(hyb->ell_nnz > 0) { - // ELL part - if(hyb->ell_nnz > 0) - { - hipLaunchKernelGGL((ellmvn_kernel_device_pointer), - ellmvn_blocks, - ellmvn_threads, - 0, - stream, - hyb->m, - hyb->n, - hyb->ell_width, - alpha, - 
hyb->ell_col_ind, - (T*)hyb->ell_val, - x, - beta, - y, - descr->base); - } + RETURN_IF_ROCSPARSE_ERROR(rocsparse_ellmv_template(handle, + trans, + hyb->m, + hyb->n, + alpha, + descr, + (T*)hyb->ell_val, + hyb->ell_col_ind, + hyb->ell_width, + x, + beta, + y)); + } - // COO part - if(hyb->coo_nnz > 0) + // COO part + if(hyb->coo_nnz > 0) + { + if(handle->pointer_mode == rocsparse_pointer_mode_device) { // Beta is applied by ELL part, IF ell_nnz > 0 if(hyb->ell_nnz > 0) @@ -262,37 +222,13 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, y)); } } - } - else - { - if(*alpha == 0.0 && *beta == 1.0) + else { - return rocsparse_status_success; - } - - // ELL part - if(hyb->ell_nnz > 0) - { - hipLaunchKernelGGL((ellmvn_kernel_host_pointer), - ellmvn_blocks, - ellmvn_threads, - 0, - stream, - hyb->m, - hyb->n, - hyb->ell_width, - *alpha, - hyb->ell_col_ind, - (T*)hyb->ell_val, - x, - *beta, - y, - descr->base); - } + if(*alpha == 0.0 && *beta == 1.0) + { + return rocsparse_status_success; + } - // COO part - if(hyb->coo_nnz > 0) - { // Beta is applied by ELL part, IF ell_nnz > 0 T coo_beta = (hyb->ell_nnz > 0) ? 
1.0 : *beta; @@ -312,7 +248,6 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, } } } -#undef ELLMVN_DIM else { // TODO From 5fa08f336e52d5552531c7250e48b6b35c6af5fe Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 30 May 2018 10:09:49 +0200 Subject: [PATCH 106/304] added asserts to unit check to be able to verify when benchmarking --- clients/common/unit.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index ab6c97bf..ee349418 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -5,6 +5,7 @@ #include "unit.hpp" #include +#include #include #ifdef GOOGLE_TEST @@ -28,6 +29,8 @@ void unit_check_general(rocsparse_int M, rocsparse_int N, float* hCPU, float* hG { #ifdef GOOGLE_TEST ASSERT_FLOAT_EQ(hCPU[i + j], hGPU[i + j]); +#else + assert(hCPU[i + j] == hGPU[i + j]); #endif } } @@ -42,6 +45,8 @@ void unit_check_general(rocsparse_int M, rocsparse_int N, double* hCPU, double* { #ifdef GOOGLE_TEST ASSERT_DOUBLE_EQ(hCPU[i + j], hGPU[i + j]); +#else + assert(hCPU[i + j] == hGPU[i + j]); #endif } } @@ -56,6 +61,8 @@ void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int* hCPU, r { #ifdef GOOGLE_TEST ASSERT_EQ(hCPU[i + j], hGPU[i + j]); +#else + assert(hCPU[i + j] == hGPU[i + j]); #endif } } From 383944278df7e3578b613ba2ada8cb49690f3836 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 30 May 2018 10:10:16 +0200 Subject: [PATCH 107/304] tests: added ellmv --- .../rocsparse_template_specialization.cpp | 58 ++- clients/include/rocsparse.hpp | 14 + clients/include/testing_ellmv.hpp | 400 ++++++++++++++++++ clients/include/testing_hybmv.hpp | 4 +- clients/tests/CMakeLists.txt | 1 + clients/tests/test_ellmv.cpp | 68 +++ 6 files changed, 541 insertions(+), 4 deletions(-) create mode 100644 clients/include/testing_ellmv.hpp create mode 100644 clients/tests/test_ellmv.cpp diff --git a/clients/common/rocsparse_template_specialization.cpp 
b/clients/common/rocsparse_template_specialization.cpp index f4bf41aa..7d253fd0 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -108,6 +108,42 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } +template <> +rocsparse_status rocsparse_ellmv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + const float* alpha, + const rocsparse_mat_descr descr, + const float* ell_val, + const rocsparse_int* ell_col_ind, + rocsparse_int ell_width, + const float* x, + const float* beta, + float* y) +{ + return rocsparse_sellmv( + handle, trans, m, n, alpha, descr, ell_val, ell_col_ind, ell_width, x, beta, y); +} + +template <> +rocsparse_status rocsparse_ellmv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + const double* alpha, + const rocsparse_mat_descr descr, + const double* ell_val, + const rocsparse_int* ell_col_ind, + rocsparse_int ell_width, + const double* x, + const double* beta, + double* y) +{ + return rocsparse_dellmv( + handle, trans, m, n, alpha, descr, ell_val, ell_col_ind, ell_width, x, beta, y); +} + template <> rocsparse_status rocsparse_hybmv(rocsparse_handle handle, rocsparse_operation trans, @@ -146,7 +182,16 @@ rocsparse_status rocsparse_csr2ell(rocsparse_handle handle, float* ell_val, rocsparse_int* ell_col_ind) { - return rocsparse_scsr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); + return rocsparse_scsr2ell(handle, + m, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind, + ell_descr, + ell_width, + ell_val, + ell_col_ind); } template <> @@ -161,7 +206,16 @@ rocsparse_status rocsparse_csr2ell(rocsparse_handle handle, double* ell_val, rocsparse_int* ell_col_ind) { - return rocsparse_dcsr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, 
csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); + return rocsparse_dcsr2ell(handle, + m, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind, + ell_descr, + ell_width, + ell_val, + ell_col_ind); } template <> diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index 13c9e992..4a6ced4f 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -48,6 +48,20 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, const T* beta, T* y); +template +rocsparse_status rocsparse_ellmv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + const T* alpha, + const rocsparse_mat_descr descr, + const T* ell_val, + const rocsparse_int* ell_col_ind, + rocsparse_int ell_width, + const T* x, + const T* beta, + T* y); + template rocsparse_status rocsparse_hybmv(rocsparse_handle handle, rocsparse_operation trans, diff --git a/clients/include/testing_ellmv.hpp b/clients/include/testing_ellmv.hpp new file mode 100644 index 00000000..e09c375d --- /dev/null +++ b/clients/include/testing_ellmv.hpp @@ -0,0 +1,400 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef TESTING_ELLMV_HPP +#define TESTING_ELLMV_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +#define ELL_IND_ROW(i, el, m, width) (el) * (m) + (i) +#define ELL_IND_EL(i, el, m, width) (el) + (width) * (i) +#define ELL_IND(i, el, m, width) ELL_IND_ROW(i, el, m, width) + +template +void testing_ellmv_bad_arg(void) +{ + rocsparse_int n = 100; + rocsparse_int m = 100; + rocsparse_int safe_size = 100; + rocsparse_int ell_width = 8; + T alpha = 0.6; + T beta = 0.2; + rocsparse_operation trans = rocsparse_operation_none; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_descr(new descr_struct); + rocsparse_mat_descr descr = unique_ptr_descr->descr; + + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dval || !dcol || !dx || !dy) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // testing for(nullptr == dcol) + { + rocsparse_int* dcol_null = nullptr; + + status = rocsparse_ellmv( + handle, trans, m, n, &alpha, descr, dval, dcol_null, ell_width, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); + } + // testing for(nullptr == dval) + { + T* 
dval_null = nullptr; + + status = rocsparse_ellmv( + handle, trans, m, n, &alpha, descr, dval_null, dcol, ell_width, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); + } + // testing for(nullptr == dx) + { + T* dx_null = nullptr; + + status = rocsparse_ellmv( + handle, trans, m, n, &alpha, descr, dval, dcol, ell_width, dx_null, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); + } + // testing for(nullptr == dy) + { + T* dy_null = nullptr; + + status = rocsparse_ellmv( + handle, trans, m, n, &alpha, descr, dval, dcol, ell_width, dx, &beta, dy_null); + verify_rocsparse_status_invalid_pointer(status, "Error: dy is nullptr"); + } + // testing for(nullptr == d_alpha) + { + T* d_alpha_null = nullptr; + + status = rocsparse_ellmv( + handle, trans, m, n, d_alpha_null, descr, dval, dcol, ell_width, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); + } + // testing for(nullptr == d_beta) + { + T* d_beta_null = nullptr; + + status = rocsparse_ellmv( + handle, trans, m, n, &alpha, descr, dval, dcol, ell_width, dx, d_beta_null, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); + } + // testing for(nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = rocsparse_ellmv( + handle, trans, m, n, &alpha, descr_null, dval, dcol, ell_width, dx, &beta, dy); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_ellmv( + handle_null, trans, m, n, &alpha, descr, dval, dcol, ell_width, dx, &beta, dy); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_ellmv(Arguments argus) +{ + rocsparse_int safe_size = 100; + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + T h_alpha = argus.alpha; + T h_beta = argus.beta; + rocsparse_operation 
trans = argus.trans; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_status status; + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + std::unique_ptr test_descr(new descr_struct); + rocsparse_mat_descr descr = test_descr->descr; + + // Set matrix index base + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr, idx_base)); + + // Determine number of non-zero elements + double scale = 0.02; + if(m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + + // Argument sanity check before allocating invalid memory + if(m <= 0 || n <= 0 || nnz <= 0) + { + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dval || !dcol || !dx || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dcol || !dval || !dx || !dy"); + return rocsparse_status_memory_error; + } + + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + status = + rocsparse_ellmv(handle, trans, m, n, &h_alpha, descr, dval, dcol, 0, dx, &h_beta, dy); + + if(m < 0 || n < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + } + + return rocsparse_status_success; + } + + // Host structures + std::vector hcsr_row_ptr; + std::vector hcoo_row_ind; + std::vector hcol_ind; + std::vector hval; + + // Initial 
Data on CPU + srand(12345ULL); + if(argus.laplacian) + { + m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcol_ind, hval, idx_base); + nnz = hcsr_row_ptr[m]; + } + else + { + if(argus.filename != "") + { + if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval) != + 0) + { + fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base); + } + + // Convert COO to CSR + if(!argus.laplacian) + { + hcsr_row_ptr.resize(m + 1, 0); + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; + } + + hcsr_row_ptr[0] = idx_base; + for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + } + } + } + + // Convert CSR to ELL + rocsparse_int ell_width = 0; + for(rocsparse_int i = 0; i < m; ++i) + { + rocsparse_int row_nnz = hcsr_row_ptr[i + 1] - hcsr_row_ptr[i]; + ell_width = (row_nnz > ell_width) ? 
row_nnz : ell_width; + } + + rocsparse_int ell_nnz = ell_width * m; + + std::vector hell_col_ind(ell_nnz); + std::vector hell_val(ell_nnz); + + for(rocsparse_int i = 0; i < m; ++i) + { + rocsparse_int p = 0; + for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; ++j) + { + rocsparse_int idx = ELL_IND(i, p, m, ell_width); + hell_val[idx] = hval[j]; + hell_col_ind[idx] = hcol_ind[j]; + ++p; + } + for(rocsparse_int j = hcsr_row_ptr[i + 1] - hcsr_row_ptr[i]; j < ell_width; ++j) + { + rocsparse_int idx = ELL_IND(i, p, m, ell_width); + hell_val[idx] = static_cast(0); + hell_col_ind[idx] = -1; + ++p; + } + } + + std::vector hx(n); + std::vector hy_1(m); + std::vector hy_2(m); + std::vector hy_gold(m); + + rocsparse_init(hx, 1, n); + rocsparse_init(hy_1, 1, m); + + // copy vector is easy in STL; hy_gold = hx: save a copy in hy_gold which will be output of CPU + hy_2 = hy_1; + hy_gold = hy_1; + + // allocate memory on device + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * ell_nnz), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * ell_nnz), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * n), device_free}; + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + auto d_beta_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy_1 = (T*)dy_1_managed.get(); + T* dy_2 = (T*)dy_2_managed.get(); + T* d_alpha = (T*)d_alpha_managed.get(); + T* d_beta = (T*)d_beta_managed.get(); + + if(!dval || !dcol || !dx || !dy_1 || !dy_2 || !d_alpha || !d_beta) + { + 
verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dval || !dcol || !dx || !dy_1 || " + "!dy_2 || !d_alpha || !d_beta"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR(hipMemcpy( + dcol, hell_col_ind.data(), sizeof(rocsparse_int) * ell_nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dval, hell_val.data(), sizeof(T) * ell_nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * n, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T) * m, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); + + if(argus.unit_check) + { + CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * m, hipMemcpyHostToDevice)); + + // ROCSPARSE pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_ellmv( + handle, trans, m, n, &h_alpha, descr, dval, dcol, ell_width, dx, &h_beta, dy_1)); + + // ROCSPARSE pointer mode device + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR(rocsparse_ellmv( + handle, trans, m, n, d_alpha, descr, dval, dcol, ell_width, dx, d_beta, dy_2)); + + // copy output from device to CPU + CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * m, hipMemcpyDeviceToHost)); + + // CPU + double cpu_time_used = get_time_us(); + + for(rocsparse_int i = 0; i < m; ++i) + { + hy_gold[i] *= h_beta; + for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; + ++j) + { + hy_gold[i] += h_alpha * hval[j] * hx[hcol_ind[j] - idx_base]; + } + } + + cpu_time_used = get_time_us() - cpu_time_used; + + // enable unit check, notice unit check is not invasive, 
but norm check is, + // unit check and norm check can not be interchanged their order + unit_check_general(1, m, hy_gold.data(), hy_1.data()); + unit_check_general(1, m, hy_gold.data(), hy_2.data()); + } + + if(argus.timing) + { + int number_cold_calls = 2; + int number_hot_calls = argus.iters; + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + for(int iter = 0; iter < number_cold_calls; iter++) + { + rocsparse_ellmv( + handle, trans, m, n, &h_alpha, descr, dval, dcol, ell_width, dx, &h_beta, dy_1); + } + + double gpu_time_used = get_time_us(); // in microseconds + + for(int iter = 0; iter < number_hot_calls; iter++) + { + rocsparse_ellmv( + handle, trans, m, n, &h_alpha, descr, dval, dcol, ell_width, dx, &h_beta, dy_1); + } + + // Convert to miliseconds per call + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + size_t flops = (h_alpha != 1.0) ? 3.0 * nnz : 2.0 * nnz; + flops = (h_beta != 0.0) ? flops + m : flops; + double gpu_gflops = flops / gpu_time_used / 1e6; + size_t memtrans = sizeof(T) * (m + n + ell_nnz); + memtrans += sizeof(rocsparse_int) * ell_nnz; + memtrans = (h_beta != 0.0) ? 
memtrans + sizeof(T) * m : memtrans; + double bandwidth = memtrans / gpu_time_used / 1e6; + + printf("m\t\tn\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", + m, + n, + ell_nnz, + h_alpha, + h_beta, + gpu_gflops, + bandwidth, + gpu_time_used); + } + + return rocsparse_status_success; +} + +#endif // TESTING_ELLMV_HPP diff --git a/clients/include/testing_hybmv.hpp b/clients/include/testing_hybmv.hpp index 22b239ab..1c5f67ad 100644 --- a/clients/include/testing_hybmv.hpp +++ b/clients/include/testing_hybmv.hpp @@ -357,7 +357,7 @@ rocsparse_status testing_hybmv(Arguments argus) double gpu_gflops = flops / gpu_time_used / 1e6; size_t ell_mem = dhyb->ell_nnz * (sizeof(rocsparse_int) + sizeof(T)); size_t coo_mem = dhyb->coo_nnz * (sizeof(rocsparse_int) * 2 + sizeof(T)); - size_t memtrans = 2 * m + ell_mem + coo_mem; + size_t memtrans = (m + n) * sizeof(T) + ell_mem + coo_mem; memtrans = (h_beta != 0.0) ? memtrans + m : memtrans; double bandwidth = memtrans / gpu_time_used / 1e6; @@ -365,7 +365,7 @@ rocsparse_status testing_hybmv(Arguments argus) printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", m, n, - nnz, + dhyb->ell_nnz + dhyb->coo_nnz, h_alpha, h_beta, gpu_gflops, diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index ba0bd62c..711c324d 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -9,6 +9,7 @@ set(ROCSPARSE_TEST_SOURCES test_axpyi.cpp test_coomv.cpp test_csrmv.cpp + test_ellmv.cpp test_hybmv.cpp test_csr2coo.cpp test_csr2ell.cpp diff --git a/clients/tests/test_ellmv.cpp b/clients/tests/test_ellmv.cpp new file mode 100644 index 00000000..c5d8da4c --- /dev/null +++ b/clients/tests/test_ellmv.cpp @@ -0,0 +1,68 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "testing_ellmv.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef rocsparse_index_base base; +typedef std::tuple ellmv_tuple; + +int ell_M_range[] = {-1, 0, 10, 500, 7111, 10000}; +int ell_N_range[] = {-3, 0, 33, 842, 4441, 10000}; + +std::vector ell_alpha_range = {2.0, 3.0}; +std::vector ell_beta_range = {0.0, 0.6}; + +base ell_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; + +class parameterized_ellmv : public testing::TestWithParam +{ + protected: + parameterized_ellmv() {} + virtual ~parameterized_ellmv() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_ellmv_arguments(ellmv_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.alpha = std::get<2>(tup); + arg.beta = std::get<3>(tup); + arg.idx_base = std::get<4>(tup); + arg.timing = 0; + return arg; +} + +TEST(ellmv_bad_arg, ellmv_float) { testing_ellmv_bad_arg(); } + +TEST_P(parameterized_ellmv, ellmv_float) +{ + Arguments arg = setup_ellmv_arguments(GetParam()); + + rocsparse_status status = testing_ellmv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_ellmv, ellmv_double) +{ + Arguments arg = setup_ellmv_arguments(GetParam()); + + rocsparse_status status = testing_ellmv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(ellmv, + parameterized_ellmv, + testing::Combine(testing::ValuesIn(ell_M_range), + testing::ValuesIn(ell_N_range), + testing::ValuesIn(ell_alpha_range), + testing::ValuesIn(ell_beta_range), + testing::ValuesIn(ell_idxbase_range))); From 45563bcb1aa1354f9697fed54ad5fa1428a8fa74 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 30 May 2018 10:10:25 +0200 Subject: [PATCH 108/304] benchmarks: added ellmv, csr2ell --- clients/benchmarks/client.cpp | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) 
diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 463f4788..a59bed4e 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -6,9 +6,11 @@ #include "rocsparse.hpp" #include "testing_coomv.hpp" #include "testing_csrmv.hpp" +#include "testing_ellmv.hpp" #include "testing_hybmv.hpp" #include "testing_axpyi.hpp" #include "testing_csr2coo.hpp" +#include "testing_csr2ell.hpp" #include "testing_csr2hyb.hpp" #include "testing_coo2csr.hpp" @@ -65,14 +67,14 @@ int main(int argc, char* argv[]) ("function,f", po::value(&function)->default_value("axpyi"), - "SPARSE function to test. Options: axpyi, coomv, csrmv, hybmv, csr2coo, csr2hyb, " - "coo2csr") + "SPARSE function to test. Options: axpyi, coomv, csrmv, ellmv, hybmv, csr2coo, " + "csr2ell, csr2hyb, coo2csr") ("precision,r", po::value(&precision)->default_value('s'), "Options: s,d") ("verify,v", - po::value(&argus.norm_check)->default_value(0), + po::value(&argus.unit_check)->default_value(0), "Validate GPU results with CPU? 
0 = No, 1 = Yes (default: No)") ("iters,i", @@ -142,6 +144,13 @@ int main(int argc, char* argv[]) else if(precision == 'd') testing_csrmv(argus); } + else if(function == "ellmv") + { + if(precision == 's') + testing_ellmv(argus); + else if(precision == 'd') + testing_ellmv(argus); + } else if(function == "hybmv") { if(precision == 's') @@ -153,6 +162,13 @@ int main(int argc, char* argv[]) { testing_csr2coo(argus); } + else if(function == "csr2ell") + { + if(precision == 's') + testing_csr2ell(argus); + else if(precision == 'd') + testing_csr2ell(argus); + } else if(function == "csr2hyb") { if(precision == 's') From 45b55c1bcf6f71c33b2f035b1f38e23d98b893be Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 30 May 2018 10:57:48 +0200 Subject: [PATCH 109/304] comments & clangformat --- library/include/rocsparse-functions.h | 174 +++++++++++++++---- library/include/rocsparse-types.h | 52 +++--- library/src/conversion/coo2csr_device.h | 2 + library/src/conversion/csr2coo_device.h | 1 + library/src/conversion/csr2ell_device.h | 9 +- library/src/conversion/csr2hyb_device.h | 8 + library/src/conversion/rocsparse_csr2ell.cpp | 30 +++- library/src/level1/axpyi_device.h | 1 + library/src/level1/rocsparse_axpyi.hpp | 28 +-- library/src/level2/coomv_device.h | 3 + library/src/level2/ellmv_device.h | 1 + library/src/level2/rocsparse_coomv.hpp | 40 ----- library/src/level2/rocsparse_csrmv.hpp | 41 ----- library/src/level2/rocsparse_ellmv.hpp | 38 ---- library/src/level2/rocsparse_hybmv.hpp | 2 +- 15 files changed, 221 insertions(+), 209 deletions(-) diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 00b4a48a..7d3a5fd1 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -302,7 +302,7 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, \details ellmv multiplies the dense vector x[i] with scalar alpha and sparse m x n - matrix A that is defined in ELL storage format and add the 
result to y[i] + matrix A that is defined in ELL storage format and adds the result to y[i] that is multiplied by beta, for i = 1 , … , n y := alpha * op(A) * x + beta * y, @@ -322,10 +322,12 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, descr descriptor of A. @param[in] ell_val array of nnz elements of A. + Padded elements should be set to 0. @param[in] ell_col_ind array of nnz elements containing the column indices of A. + Padded column indices should be set to -1. @param[in] - ell_width ELL width that was pre-computed during format conversion. + ell_width number of non-zero elements per row in ELL storage format. @param[in] x array of n elements (op(A) = A) or m elements (op(A) = A^T or op(A) = A^H). @@ -364,11 +366,41 @@ rocsparse_status rocsparse_dellmv(rocsparse_handle handle, const double* beta, double* y); +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_sellmv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + const rocsparse_float_complex* alpha, + const rocsparse_mat_descr descr, + const rocsparse_float_complex* ell_val, + const rocsparse_int* ell_col_ind, + rocsparse_int ell_width, + const rocsparse_float_complex* x, + const rocsparse_float_complex* beta, + rocsparse_float_complex* y); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_sellmv(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + const rocsparse_double_complex* alpha, + const rocsparse_mat_descr descr, + const rocsparse_double_complex* ell_val, + const rocsparse_int* ell_col_ind, + rocsparse_int ell_width, + const rocsparse_double_complex* x, + const rocsparse_double_complex* beta, + rocsparse_double_complex* y); +*/ + /*! 
\brief SPARSE Level 2 API \details hybmv multiplies the dense vector x[i] with scalar alpha and sparse m x n - matrix A that is defined in HYB storage format and add the result to y[i] + matrix A that is defined in HYB storage format and adds the result to y[i] that is multiplied by beta, for i = 1 , … , n y := alpha * op(A) * x + beta * y, @@ -434,6 +466,7 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, const rocsparse_double_complex* beta, rocsparse_double_complex* y); */ + /* * =========================================================================== * level 3 SPARSE @@ -479,9 +512,24 @@ rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, /*! \brief SPARSE Format Conversions API \details - csr2ell converts a CSR matrix into an ELL matrix. + csr2ell_width computes the maximum of the per row non-zeros over all + rows, the ELL width, for a given CSR matrix. - // TODO + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + m number of rows of A. + @param[in] + csr_descr descriptor of the CSR matrix. + @param[in] + csr_row_ptr array of m+1 elements that point to the start of every row + of A. + @param[in] + ell_descr descriptor of the ELL matrix. + @param[out] + ell_width pointer to the number of non-zero elements per row in ELL + storage format. ********************************************************************/ ROCSPARSE_EXPORT @@ -492,8 +540,41 @@ rocsparse_status rocsparse_csr2ell_width(rocsparse_handle handle, const rocsparse_mat_descr ell_descr, rocsparse_int* ell_width); -// TODO descr. text +/*! \brief SPARSE Format Conversions API + \details + csr2ell converts a CSR matrix into an ELL matrix. It is assumed, that + ell_val and ell_col_ind are allocated. Allocation size is computed by + the number of rows times the number of ELL non-zero elements per row. + The number of ELL non-zero elements per row can be obtained by calling + csr2ell_width routine. 
+ + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + m number of rows of A. + @param[in] + csr_descr descriptor of the CSR matrix. + @param[in] + csr_val array of nnz elements of A. + @param[in] + csr_row_ptr array of m+1 elements that point to the start + of every row of A. + @param[in] + csr_col_ind array of nnz elements containing the column indices of A. + @param[in] + ell_descr descriptor of the ELL matrix. + @param[in] + ell_width number of non-zero elements per row in ELL storage format. + @param[out] + ell_val array of nnz elements of A. Padded elements should be set + to 0. + @param[out] + ell_col_ind array of nnz elements containing the column indices of A. + Padded column indices should be set to -1. + + ********************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_scsr2ell(rocsparse_handle handle, rocsparse_int m, @@ -518,35 +599,31 @@ rocsparse_status rocsparse_dcsr2ell(rocsparse_handle handle, double* ell_val, rocsparse_int* ell_col_ind); -/*! \brief SPARSE Format Conversions API - - \details - coo2csr converts the COO array containing the row indices into a - CSR array of row offset pointers. - - @param[in] - handle rocsparse_handle. - handle to the rocsparse library context queue. - @param[in] - coo_row_ind array of nnz elements containing the row indices of A. - @param[in] - nnz number of non-zero entries of the sparse matrix A. - @param[in] - m number of rows of the sparse matrix A. - @param[out] - csr_row_ptr array of m+1 elements that point to the start of every row - of A. - @param[in] - idx_base rocsparse_index_base_zero or rocsparse_index_base_one. 
+/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsr2ell(rocsparse_handle handle, + rocsparse_int m, + const rocsparse_mat_descr csr_descr, + const rocsparse_float_complex* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + rocsparse_float_complex* ell_val, + rocsparse_int* ell_col_ind); - ********************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, - const rocsparse_int* coo_row_ind, - rocsparse_int nnz, - rocsparse_int m, - rocsparse_int* csr_row_ptr, - rocsparse_index_base idx_base); +rocsparse_status rocsparse_scsr2ell(rocsparse_handle handle, + rocsparse_int m, + const rocsparse_mat_descr csr_descr, + const rocsparse_double_complex* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + rocsparse_double_complex* ell_val, + rocsparse_int* ell_col_ind); +*/ /*! \brief SPARSE Format Conversions API @@ -631,6 +708,37 @@ rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, rocsparse_int user_ell_width, rocsparse_hyb_partition partition_type); */ + +/*! \brief SPARSE Format Conversions API + + \details + coo2csr converts the COO array containing the row indices into a + CSR array of row offset pointers. + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + coo_row_ind array of nnz elements containing the row indices of A. + @param[in] + nnz number of non-zero entries of the sparse matrix A. + @param[in] + m number of rows of the sparse matrix A. + @param[out] + csr_row_ptr array of m+1 elements that point to the start of every row + of A. + @param[in] + idx_base rocsparse_index_base_zero or rocsparse_index_base_one. 
+ + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, + const rocsparse_int* coo_row_ind, + rocsparse_int nnz, + rocsparse_int m, + rocsparse_int* csr_row_ptr, + rocsparse_index_base idx_base); + #ifdef __cplusplus } #endif diff --git a/library/include/rocsparse-types.h b/library/include/rocsparse-types.h index 8f696068..1aabae39 100644 --- a/library/include/rocsparse-types.h +++ b/library/include/rocsparse-types.h @@ -41,52 +41,52 @@ typedef enum rocsparse_operation_ { /*! \brief Used to specify the matrix index base. */ typedef enum rocsparse_index_base_ { - rocsparse_index_base_zero = 0, - rocsparse_index_base_one = 1 + rocsparse_index_base_zero = 0, /**< zero based indexing. */ + rocsparse_index_base_one = 1 /**< one based indexing. */ } rocsparse_index_base; /*! \brief Used to specify the matrix type. */ typedef enum rocsparse_matrix_type_ { - rocsparse_matrix_type_general = 0, - rocsparse_matrix_type_symmetric = 1, - rocsparse_matrix_type_hermitian = 2, - rocsparse_matrix_type_triangular = 3 + rocsparse_matrix_type_general = 0, /**< general matrix type. */ + rocsparse_matrix_type_symmetric = 1, /**< symmetric matrix type. */ + rocsparse_matrix_type_hermitian = 2, /**< hermitian matrix type. */ + rocsparse_matrix_type_triangular = 3 /**< triangular matrix type. */ } rocsparse_matrix_type; /*! \brief HYB matrix partition type. */ typedef enum rocsparse_hyb_partition_ { - rocsparse_hyb_partition_auto = 0, - rocsparse_hyb_partition_user = 1, - rocsparse_hyb_partition_max = 2 + rocsparse_hyb_partition_auto = 0, /**< automatically decide on ELL nnz per row. */ + rocsparse_hyb_partition_user = 1, /**< user given ELL nnz per row. */ + rocsparse_hyb_partition_max = 2 /**< max ELL nnz per row, no COO part. 
*/ } rocsparse_hyb_partition; /* ==================================================================================== */ /** - * @brief rocsparse status codes definition + * @brief rocsparse status codes definition. */ typedef enum rocsparse_status_ { - rocsparse_status_success = 0, /**< success */ - rocsparse_status_invalid_handle = 1, /**< handle not initialized, invalid or null */ - rocsparse_status_not_implemented = 2, /**< function is not implemented */ - rocsparse_status_invalid_pointer = 3, /**< invalid pointer parameter */ - rocsparse_status_invalid_size = 4, /**< invalid size parameter */ - rocsparse_status_memory_error = 5, /**< failed memory allocation, copy, dealloc */ - rocsparse_status_internal_error = 6, /**< other internal library failure */ - rocsparse_status_invalid_value = 7, /**< invalid value parameter */ - rocsparse_status_arch_mismatch = 8 /**< device arch is not supported */ + rocsparse_status_success = 0, /**< success. */ + rocsparse_status_invalid_handle = 1, /**< handle not initialized, invalid or null. */ + rocsparse_status_not_implemented = 2, /**< function is not implemented. */ + rocsparse_status_invalid_pointer = 3, /**< invalid pointer parameter. */ + rocsparse_status_invalid_size = 4, /**< invalid size parameter. */ + rocsparse_status_memory_error = 5, /**< failed memory allocation, copy, dealloc. */ + rocsparse_status_internal_error = 6, /**< other internal library failure. */ + rocsparse_status_invalid_value = 7, /**< invalid value parameter. */ + rocsparse_status_arch_mismatch = 8 /**< device arch is not supported. */ } rocsparse_status; -/*! \brief Indicates the pointer is device pointer or host pointer */ +/*! \brief Indicates the pointer is device pointer or host pointer. */ typedef enum rocsparse_pointer_mode_ { - rocsparse_pointer_mode_host = 0, - rocsparse_pointer_mode_device = 1 + rocsparse_pointer_mode_host = 0, /**< scalar pointers are in host memory. 
*/ + rocsparse_pointer_mode_device = 1 /**< scalar pointers are in device memory. */ } rocsparse_pointer_mode; -/*! \brief Indicates if layer is active with bitmask*/ +/*! \brief Indicates if layer is active with bitmask.*/ typedef enum rocsparse_layer_mode { - rocsparse_layer_mode_none = 0b0000000000, - rocsparse_layer_mode_log_trace = 0b0000000001, - rocsparse_layer_mode_log_bench = 0b0000000010, + rocsparse_layer_mode_none = 0b0000000000, /**< layer is not active. */ + rocsparse_layer_mode_log_trace = 0b0000000001, /**< layer is in logging mode. */ + rocsparse_layer_mode_log_bench = 0b0000000010, /**< layer is in benchmarking mode. */ } rocsparse_layer_mode; #ifdef __cplusplus diff --git a/library/src/conversion/coo2csr_device.h b/library/src/conversion/coo2csr_device.h index c63e1912..5a294183 100644 --- a/library/src/conversion/coo2csr_device.h +++ b/library/src/conversion/coo2csr_device.h @@ -8,6 +8,7 @@ #include +// Compute lower bound by binary search __device__ rocsparse_int lower_bound(const rocsparse_int* arr, rocsparse_int key, rocsparse_int low, @@ -31,6 +32,7 @@ __device__ rocsparse_int lower_bound(const rocsparse_int* arr, return lower_bound(arr, key, low, high); } +// COO to CSR matrix conversion kernel __global__ void coo2csr_kernel(rocsparse_int m, rocsparse_int nnz, const rocsparse_int* coo_row_ind, diff --git a/library/src/conversion/csr2coo_device.h b/library/src/conversion/csr2coo_device.h index 4cd6d2cd..410d755e 100644 --- a/library/src/conversion/csr2coo_device.h +++ b/library/src/conversion/csr2coo_device.h @@ -8,6 +8,7 @@ #include +// CSR to COO matrix conversion kernel template __global__ void csr2coo_kernel(rocsparse_int m, const rocsparse_int* csr_row_ptr, diff --git a/library/src/conversion/csr2ell_device.h b/library/src/conversion/csr2ell_device.h index 7c709fff..b373d3e5 100644 --- a/library/src/conversion/csr2ell_device.h +++ b/library/src/conversion/csr2ell_device.h @@ -6,10 +6,11 @@ #ifndef CSR2ELL_DEVICE_H #define 
CSR2ELL_DEVICE_H -//#include "handle.h" +#include "handle.h" #include +// Block reduce kernel computing maximum entry in data array template __device__ void ell_width_reduce(rocsparse_int tid, rocsparse_int* data) { @@ -26,6 +27,8 @@ __device__ void ell_width_reduce(rocsparse_int tid, rocsparse_int* data) } } +// Compute non-zero entries per CSR row and do a block reduction over the maximum +// Store result in a workspace for final reduction on part2 template __global__ void ell_width_kernel_part1(rocsparse_int m, const rocsparse_int* csr_row_ptr, rocsparse_int* workspace) @@ -52,6 +55,7 @@ ell_width_kernel_part1(rocsparse_int m, const rocsparse_int* csr_row_ptr, rocspa } } +// Part2 kernel for final reduction over the maximum CSR nnz row entries template __global__ void ell_width_kernel_part2(rocsparse_int m, rocsparse_int* workspace) { @@ -88,6 +92,7 @@ __global__ void ell_width_kernel_part2(rocsparse_int m, rocsparse_int* workspace } } +// CSR to ELL format conversion kernel template __global__ void csr2ell_kernel(rocsparse_int m, const T* csr_val, @@ -111,7 +116,7 @@ __global__ void csr2ell_kernel(rocsparse_int m, rocsparse_int row_begin = csr_row_ptr[ai] - csr_idx_base; rocsparse_int row_end = csr_row_ptr[ai + 1] - csr_idx_base; - // Fill HYB matrix + // Fill ELL matrix for(rocsparse_int aj = row_begin; aj < row_end; ++aj) { if(p >= ell_width) diff --git a/library/src/conversion/csr2hyb_device.h b/library/src/conversion/csr2hyb_device.h index 3c4ba569..0b9ab1cf 100644 --- a/library/src/conversion/csr2hyb_device.h +++ b/library/src/conversion/csr2hyb_device.h @@ -10,6 +10,7 @@ #include +// Block reduce kernel computing sum template __device__ void sum_reduce(rocsparse_int tid, rocsparse_int* data) { @@ -26,6 +27,9 @@ __device__ void sum_reduce(rocsparse_int tid, rocsparse_int* data) } } +// Compute non-zero entries per CSR row and do a block reduction over the sum +// to obtain the number of COO part non-zero entries and COO nnz per row. 
+// Store the result in a workspace for final reduction on part2 template __global__ void hyb_coo_nnz_part1(rocsparse_int m, rocsparse_int ell_width, @@ -67,6 +71,7 @@ __global__ void hyb_coo_nnz_part1(rocsparse_int m, } } +// Part2 kernel for final reduction over the sum of COO non-zero entries template __global__ void hyb_coo_nnz_part2(rocsparse_int m, rocsparse_int* workspace) { @@ -103,6 +108,7 @@ __global__ void hyb_coo_nnz_part2(rocsparse_int m, rocsparse_int* workspace) } } +// CSR to HYB format conversion kernel template __global__ void csr2hyb_kernel(rocsparse_int m, const T* csr_val, @@ -135,12 +141,14 @@ __global__ void csr2hyb_kernel(rocsparse_int m, { if(p < ell_width) { + // Fill ELL part rocsparse_int idx = ELL_IND(ai, p++, m, ell_width); ell_col_ind[idx] = csr_col_ind[aj]; ell_val[idx] = csr_val[aj]; } else { + // Fill COO part coo_row_ind[coo_idx] = ai + idx_base; coo_col_ind[coo_idx] = csr_col_ind[aj]; coo_val[coo_idx] = csr_val[aj]; diff --git a/library/src/conversion/rocsparse_csr2ell.cpp b/library/src/conversion/rocsparse_csr2ell.cpp index 1263e49c..8ff18d42 100644 --- a/library/src/conversion/rocsparse_csr2ell.cpp +++ b/library/src/conversion/rocsparse_csr2ell.cpp @@ -94,7 +94,7 @@ extern "C" rocsparse_status rocsparse_csr2ell_width(rocsparse_handle handle, hipStream_t stream = handle->stream; - // Determine ELL width +// Determine ELL width #define CSR2ELL_DIM 512 // Workspace size @@ -125,11 +125,13 @@ extern "C" rocsparse_status rocsparse_csr2ell_width(rocsparse_handle handle, // Copy ELL width back to host, if handle says so if(handle->pointer_mode == rocsparse_pointer_mode_device) { - RETURN_IF_HIP_ERROR(hipMemcpy(ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR( + hipMemcpy(ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToDevice)); } else { - RETURN_IF_HIP_ERROR(hipMemcpy(ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR( + 
hipMemcpy(ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); } return rocsparse_status_success; @@ -146,7 +148,16 @@ extern "C" rocsparse_status rocsparse_scsr2ell(rocsparse_handle handle, float* ell_val, rocsparse_int* ell_col_ind) { - return rocsparse_csr2ell_template(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); + return rocsparse_csr2ell_template(handle, + m, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind, + ell_descr, + ell_width, + ell_val, + ell_col_ind); } extern "C" rocsparse_status rocsparse_dcsr2ell(rocsparse_handle handle, @@ -160,5 +171,14 @@ extern "C" rocsparse_status rocsparse_dcsr2ell(rocsparse_handle handle, double* ell_val, rocsparse_int* ell_col_ind) { - return rocsparse_csr2ell_template(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); + return rocsparse_csr2ell_template(handle, + m, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind, + ell_descr, + ell_width, + ell_val, + ell_col_ind); } diff --git a/library/src/level1/axpyi_device.h b/library/src/level1/axpyi_device.h index 364f26d7..474505ed 100644 --- a/library/src/level1/axpyi_device.h +++ b/library/src/level1/axpyi_device.h @@ -8,6 +8,7 @@ #include +// y = a * x + y kernel for sparse x and dense y template __device__ void axpyi_device(rocsparse_int nnz, T alpha, diff --git a/library/src/level1/rocsparse_axpyi.hpp b/library/src/level1/rocsparse_axpyi.hpp index 5688111a..30d1e254 100644 --- a/library/src/level1/rocsparse_axpyi.hpp +++ b/library/src/level1/rocsparse_axpyi.hpp @@ -32,32 +32,14 @@ __global__ void axpyi_kernel_device_scalar(rocsparse_int nnz, T* y, rocsparse_index_base idx_base) { + if(*alpha == static_cast(0)) + { + return; + } + axpyi_device(nnz, *alpha, x_val, x_ind, y, idx_base); } -/*! \brief SPARSE Level 1 API - - \details - axpyi compute y := alpha * x + y - - @param[in] - handle rocsparse_handle. 
- handle to the rocsparse library context queue. - @param[in] - nnz number of non-zero entries in x - if nnz <= 0 quick return with rocsparse_status_success - @param[in] - alpha scalar alpha. - @param[in] - x_val pointer storing vector x non-zero values on the GPU. - @param[in] - x_ind pointer storing vector x non-zero value indices on the GPU. - @param[inout] - y pointer storing y on the GPU. - @param[in] - idx_base specifies the index base. - - ********************************************************************/ template rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, rocsparse_int nnz, diff --git a/library/src/level2/coomv_device.h b/library/src/level2/coomv_device.h index fb3c9039..a2835b6e 100644 --- a/library/src/level2/coomv_device.h +++ b/library/src/level2/coomv_device.h @@ -8,6 +8,7 @@ #include +// Scale kernel for beta != 1.0 template __global__ void coomv_scale(rocsparse_int size, T scalar, T* data) { @@ -158,6 +159,7 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, } } +// Segmented block reduction kernel template static __device__ void segmented_blockreduce(const rocsparse_int* rows, T* vals) { @@ -180,6 +182,7 @@ static __device__ void segmented_blockreduce(const rocsparse_int* rows, T* vals) } } +// Do the final block reduction of the block reduction buffers back into global memory template __global__ void coomvn_general_block_reduce(rocsparse_int nnz, const rocsparse_int* row_block_red, diff --git a/library/src/level2/ellmv_device.h b/library/src/level2/ellmv_device.h index e45e095e..9d9e19b5 100644 --- a/library/src/level2/ellmv_device.h +++ b/library/src/level2/ellmv_device.h @@ -6,6 +6,7 @@ #include +// ELL SpMV for general, non-transposed matrices template static __device__ void ellmvn_device(rocsparse_int m, rocsparse_int n, diff --git a/library/src/level2/rocsparse_coomv.hpp b/library/src/level2/rocsparse_coomv.hpp index dbc43f2d..ae5b45f9 100644 --- a/library/src/level2/rocsparse_coomv.hpp +++ 
b/library/src/level2/rocsparse_coomv.hpp @@ -66,46 +66,6 @@ __global__ void coomvn_warp_device_pointer(rocsparse_int nnz, idx_base); } -/*! \brief SPARSE Level 2 API - - \details - coomv multiplies the dense vector x[i] with scalar alpha and sparse m x n - matrix A that is defined in COO storage format and add the result to y[i] - that is multiplied by beta, for i = 1 , … , n - - y := alpha * op(A) * x + beta * y, - - @param[in] - handle rocsparse_handle. - handle to the rocsparse library context queue. - @param[in] - trans operation type of A. - @param[in] - m number of rows of A. - @param[in] - n number of columns of A. - @param[in] - nnz number of non-zero entries of A. - @param[in] - alpha scalar alpha. - @param[in] - descr descriptor of A. - @param[in] - coo_val array of nnz elements of A. - @param[in] - coo_row_ind array of nnz elements containing the row indices of A. - @param[in] - coo_col_ind array of nnz elements containing the column indices of A. - @param[in] - x array of n elements (op(A) = A) or m elements (op(A) = A^T or - op(A) = A^H). - @param[in] - beta scalar beta. - @param[inout] - y array of m elements (op(A) = A) or n elements (op(A) = A^T or - op(A) = A^H). - - ********************************************************************/ template rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, rocsparse_operation trans, diff --git a/library/src/level2/rocsparse_csrmv.hpp b/library/src/level2/rocsparse_csrmv.hpp index 8853a7fe..f2ad0819 100644 --- a/library/src/level2/rocsparse_csrmv.hpp +++ b/library/src/level2/rocsparse_csrmv.hpp @@ -43,47 +43,6 @@ __global__ void csrmvn_kernel_device_pointer(rocsparse_int m, m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y, idx_base); } -/*! 
\brief SPARSE Level 2 API - - \details - csrmv multiplies the dense vector x[i] with scalar alpha and sparse m x n - matrix A that is defined in CSR storage format and add the result to y[i] - that is multiplied by beta, for i = 1 , … , n - - y := alpha * op(A) * x + beta * y, - - @param[in] - handle rocsparse_handle. - handle to the rocsparse library context queue. - @param[in] - trans operation type of A. - @param[in] - m number of rows of A. - @param[in] - n number of columns of A. - @param[in] - nnz number of non-zero entries of A. - @param[in] - alpha scalar alpha. - @param[in] - descr descriptor of A. - @param[in] - csr_val array of nnz elements of A. - @param[in] - csr_row_ptr array of m+1 elements that point to the start - of every row of A. - @param[in] - csr_col_ind array of nnz elements containing the column indices of A. - @param[in] - x array of n elements (op(A) = A) or m elements (op(A) = A^T or - op(A) = A^H). - @param[in] - beta scalar beta. - @param[inout] - y array of m elements (op(A) = A) or n elements (op(A) = A^T or - op(A) = A^H). - - ********************************************************************/ template rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, rocsparse_operation trans, diff --git a/library/src/level2/rocsparse_ellmv.hpp b/library/src/level2/rocsparse_ellmv.hpp index 7e596c28..05adc1d3 100644 --- a/library/src/level2/rocsparse_ellmv.hpp +++ b/library/src/level2/rocsparse_ellmv.hpp @@ -44,44 +44,6 @@ __global__ void ellmvn_kernel_device_pointer(rocsparse_int m, ellmvn_device(m, n, ell_width, *alpha, ell_col_ind, ell_val, x, *beta, y, idx_base); } -/*! \brief SPARSE Level 2 API - - \details - ellmv multiplies the dense vector x[i] with scalar alpha and sparse m x n - matrix A that is defined in ELL storage format and add the result to y[i] - that is multiplied by beta, for i = 1 , … , n - - y := alpha * op(A) * x + beta * y, - - @param[in] - handle rocsparse_handle. 
- handle to the rocsparse library context queue. - @param[in] - trans operation type of A. - @param[in] - m number of rows of A. - @param[in] - n number of columns of A. - @param[in] - alpha scalar alpha. - @param[in] - descr descriptor of A. - @param[in] - ell_val array of nnz elements of A. - @param[in] - ell_col_ind array of nnz elements containing the column indices of A. - @param[in] - ell_width ELL width that was pre-computed during format conversion. - @param[in] - x array of n elements (op(A) = A) or m elements (op(A) = A^T or - op(A) = A^H). - @param[in] - beta scalar beta. - @param[inout] - y array of m elements (op(A) = A) or n elements (op(A) = A^T or - op(A) = A^H). - - ********************************************************************/ template rocsparse_status rocsparse_ellmv_template(rocsparse_handle handle, rocsparse_operation trans, diff --git a/library/src/level2/rocsparse_hybmv.hpp b/library/src/level2/rocsparse_hybmv.hpp index 92086ea4..7b9ec378 100644 --- a/library/src/level2/rocsparse_hybmv.hpp +++ b/library/src/level2/rocsparse_hybmv.hpp @@ -13,7 +13,7 @@ #include "rocsparse_coomv.hpp" #include "rocsparse_ellmv.hpp" -#include +#include template rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, From b78b0a53b18ee29628fce091a2ac14620e370825 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 1 Jun 2018 10:52:39 +0200 Subject: [PATCH 110/304] added sparse level1 placeholders --- library/src/CMakeLists.txt | 12 ++++++ library/src/level1/dotci_device.h | 0 library/src/level1/doti_device.h | 0 library/src/level1/gthr_device.h | 0 library/src/level1/gthrz_device.h | 0 library/src/level1/rocsparse_dotci.cpp | 35 ++++++++++++++++ library/src/level1/rocsparse_dotci.hpp | 28 +++++++++++++ library/src/level1/rocsparse_doti.cpp | 57 ++++++++++++++++++++++++++ library/src/level1/rocsparse_doti.hpp | 28 +++++++++++++ library/src/level1/rocsparse_gthr.cpp | 32 +++++++++++++++ library/src/level1/rocsparse_gthr.hpp | 27 ++++++++++++ 
library/src/level1/rocsparse_gthrz.cpp | 32 +++++++++++++++ library/src/level1/rocsparse_gthrz.hpp | 27 ++++++++++++ library/src/level1/rocsparse_roti.cpp | 36 ++++++++++++++++ library/src/level1/rocsparse_roti.hpp | 29 +++++++++++++ library/src/level1/rocsparse_sctr.cpp | 32 +++++++++++++++ library/src/level1/rocsparse_sctr.hpp | 27 ++++++++++++ library/src/level1/roti_device.h | 0 library/src/level1/sctr_device.h | 0 19 files changed, 402 insertions(+) create mode 100644 library/src/level1/dotci_device.h create mode 100644 library/src/level1/doti_device.h create mode 100644 library/src/level1/gthr_device.h create mode 100644 library/src/level1/gthrz_device.h create mode 100644 library/src/level1/rocsparse_dotci.cpp create mode 100644 library/src/level1/rocsparse_dotci.hpp create mode 100644 library/src/level1/rocsparse_doti.cpp create mode 100644 library/src/level1/rocsparse_doti.hpp create mode 100644 library/src/level1/rocsparse_gthr.cpp create mode 100644 library/src/level1/rocsparse_gthr.hpp create mode 100644 library/src/level1/rocsparse_gthrz.cpp create mode 100644 library/src/level1/rocsparse_gthrz.hpp create mode 100644 library/src/level1/rocsparse_roti.cpp create mode 100644 library/src/level1/rocsparse_roti.hpp create mode 100644 library/src/level1/rocsparse_sctr.cpp create mode 100644 library/src/level1/rocsparse_sctr.hpp create mode 100644 library/src/level1/roti_device.h create mode 100644 library/src/level1/sctr_device.h diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index df839cf5..d79dec8b 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -7,11 +7,23 @@ set(rocsparse_source src/handle.cpp src/status.cpp src/rocsparse_auxiliary.cpp + +# Level1 src/level1/rocsparse_axpyi.cpp + src/level1/rocsparse_doti.cpp + src/level1/rocsparse_dotci.cpp + src/level1/rocsparse_gthr.cpp + src/level1/rocsparse_gthrz.cpp + src/level1/rocsparse_roti.cpp + src/level1/rocsparse_sctr.cpp + +# Level2 
src/level2/rocsparse_coomv.cpp src/level2/rocsparse_csrmv.cpp src/level2/rocsparse_ellmv.cpp src/level2/rocsparse_hybmv.cpp + +# Conversion src/conversion/rocsparse_csr2coo.cpp src/conversion/rocsparse_csr2ell.cpp src/conversion/rocsparse_csr2hyb.cpp diff --git a/library/src/level1/dotci_device.h b/library/src/level1/dotci_device.h new file mode 100644 index 00000000..e69de29b diff --git a/library/src/level1/doti_device.h b/library/src/level1/doti_device.h new file mode 100644 index 00000000..e69de29b diff --git a/library/src/level1/gthr_device.h b/library/src/level1/gthr_device.h new file mode 100644 index 00000000..e69de29b diff --git a/library/src/level1/gthrz_device.h b/library/src/level1/gthrz_device.h new file mode 100644 index 00000000..e69de29b diff --git a/library/src/level1/rocsparse_dotci.cpp b/library/src/level1/rocsparse_dotci.cpp new file mode 100644 index 00000000..c5226404 --- /dev/null +++ b/library/src/level1/rocsparse_dotci.cpp @@ -0,0 +1,35 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "rocsparse.h" +#include "rocsparse_dotci.hpp" + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ +/* +extern "C" rocsparse_status rocsparse_cdotci(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_float_complex* x_val, + const rocsparse_int* x_ind, + const rocsparse_float_complex* y, + rocsparse_float_complex* result, + rocsparse_index_base idx_base) +{ + return rocsparse_dotci_template(handle, nnz, x_val, x_ind, y, result, idx_base); +} + +extern "C" rocsparse_status rocsparse_zdotci(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_double_complex* x_val, + const rocsparse_int* x_ind, + const rocsparse_double_complex* y, + rocsparse_double_complex* result, + rocsparse_index_base idx_base) +{ + return rocsparse_dotci_template(handle, nnz, x_val, x_ind, y, result, idx_base); +} +*/ diff --git a/library/src/level1/rocsparse_dotci.hpp b/library/src/level1/rocsparse_dotci.hpp new file mode 100644 index 00000000..8d642e7b --- /dev/null +++ b/library/src/level1/rocsparse_dotci.hpp @@ -0,0 +1,28 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_DOTCI_HPP +#define ROCSPARSE_DOTCI_HPP + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "dotci_device.h" + +#include + +template +rocsparse_status rocsparse_dotci_template(rocsparse_handle handle, + rocsparse_int nnz, + const T* x_val, + const rocsparse_int* x_ind, + const T* y, + T* result, + rocsparse_index_base idx_base) +{ + return rocsparse_status_not_implemented; +} + +#endif // ROCSPARSE_DOTCI_HPP diff --git a/library/src/level1/rocsparse_doti.cpp b/library/src/level1/rocsparse_doti.cpp new file mode 100644 index 00000000..5b2e4703 --- /dev/null +++ b/library/src/level1/rocsparse_doti.cpp @@ -0,0 +1,57 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "rocsparse.h" +#include "rocsparse_doti.hpp" + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_sdoti(rocsparse_handle handle, + rocsparse_int nnz, + const float* x_val, + const rocsparse_int* x_ind, + const float* y, + float* result, + rocsparse_index_base idx_base) +{ + return rocsparse_doti_template(handle, nnz, x_val, x_ind, y, result, idx_base); +} + +extern "C" rocsparse_status rocsparse_ddoti(rocsparse_handle handle, + rocsparse_int nnz, + const double* x_val, + const rocsparse_int* x_ind, + const double* y, + double* result, + rocsparse_index_base idx_base) +{ + return rocsparse_doti_template(handle, nnz, x_val, x_ind, y, result, idx_base); +} +/* +extern "C" rocsparse_status rocsparse_sdoti(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_float_complex* x_val, + const rocsparse_int* x_ind, + const 
rocsparse_float_complex* y, + rocsparse_float_complex* result, + rocsparse_index_base idx_base) +{ + return rocsparse_doti_template(handle, nnz, x_val, x_ind, y, result, idx_base); +} + +extern "C" rocsparse_status rocsparse_ddoti(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_double_complex* x_val, + const rocsparse_int* x_ind, + const rocsparse_double_complex* y, + rocsparse_double_complex* result, + rocsparse_index_base idx_base) +{ + return rocsparse_doti_template(handle, nnz, x_val, x_ind, y, result, idx_base); +} +*/ diff --git a/library/src/level1/rocsparse_doti.hpp b/library/src/level1/rocsparse_doti.hpp new file mode 100644 index 00000000..cb47e939 --- /dev/null +++ b/library/src/level1/rocsparse_doti.hpp @@ -0,0 +1,28 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_DOTI_HPP +#define ROCSPARSE_DOTI_HPP + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "doti_device.h" + +#include + +template +rocsparse_status rocsparse_doti_template(rocsparse_handle handle, + rocsparse_int nnz, + const T* x_val, + const rocsparse_int* x_ind, + const T* y, + T* result, + rocsparse_index_base idx_base) +{ + return rocsparse_status_not_implemented; +} + +#endif // ROCSPARSE_DOTI_HPP diff --git a/library/src/level1/rocsparse_gthr.cpp b/library/src/level1/rocsparse_gthr.cpp new file mode 100644 index 00000000..e2ac4fd3 --- /dev/null +++ b/library/src/level1/rocsparse_gthr.cpp @@ -0,0 +1,32 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "rocsparse.h" +#include "rocsparse_gthr.hpp" + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_sgthr(rocsparse_handle handle, + rocsparse_int nnz, + const float* y, + float* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base) +{ + return rocsparse_gthr_template(handle, nnz, y, x_val, x_ind, idx_base); +} + +extern "C" rocsparse_status rocsparse_dgthr(rocsparse_handle handle, + rocsparse_int nnz, + const double* y, + double* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base) +{ + return rocsparse_gthr_template(handle, nnz, y, x_val, x_ind, idx_base); +} diff --git a/library/src/level1/rocsparse_gthr.hpp b/library/src/level1/rocsparse_gthr.hpp new file mode 100644 index 00000000..5b69b454 --- /dev/null +++ b/library/src/level1/rocsparse_gthr.hpp @@ -0,0 +1,27 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_GTHR_HPP +#define ROCSPARSE_GTHR_HPP + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "gthr_device.h" + +#include + +template +rocsparse_status rocsparse_gthr_template(rocsparse_handle handle, + rocsparse_int nnz, + const T* y, + T* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base) +{ + return rocsparse_status_not_implemented; +} + +#endif // ROCSPARSE_GTHR_HPP diff --git a/library/src/level1/rocsparse_gthrz.cpp b/library/src/level1/rocsparse_gthrz.cpp new file mode 100644 index 00000000..d376e917 --- /dev/null +++ b/library/src/level1/rocsparse_gthrz.cpp @@ -0,0 +1,32 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "rocsparse.h" +#include "rocsparse_gthrz.hpp" + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_sgthrz(rocsparse_handle handle, + rocsparse_int nnz, + const float* y, + float* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base) +{ + return rocsparse_gthrz_template(handle, nnz, y, x_val, x_ind, idx_base); +} + +extern "C" rocsparse_status rocsparse_dgthrz(rocsparse_handle handle, + rocsparse_int nnz, + const double* y, + double* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base) +{ + return rocsparse_gthrz_template(handle, nnz, y, x_val, x_ind, idx_base); +} diff --git a/library/src/level1/rocsparse_gthrz.hpp b/library/src/level1/rocsparse_gthrz.hpp new file mode 100644 index 00000000..747edd73 --- /dev/null +++ b/library/src/level1/rocsparse_gthrz.hpp @@ -0,0 +1,27 @@ +/* 
************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_GTHRZ_HPP +#define ROCSPARSE_GTHRZ_HPP + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "gthrz_device.h" + +#include + +template +rocsparse_status rocsparse_gthrz_template(rocsparse_handle handle, + rocsparse_int nnz, + const T* y, + T* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base) +{ + return rocsparse_status_not_implemented; +} + +#endif // ROCSPARSE_GTHRZ_HPP diff --git a/library/src/level1/rocsparse_roti.cpp b/library/src/level1/rocsparse_roti.cpp new file mode 100644 index 00000000..05c29d9d --- /dev/null +++ b/library/src/level1/rocsparse_roti.cpp @@ -0,0 +1,36 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "rocsparse.h" +#include "rocsparse_roti.hpp" + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_sroti(rocsparse_handle handle, + rocsparse_int nnz, + float* x_val, + const rocsparse_int* x_ind, + float* y, + const float* c, + const float* s, + rocsparse_index_base idx_base) +{ + return rocsparse_roti_template(handle, nnz, x_val, x_ind, y, c, s, idx_base); +} + +extern "C" rocsparse_status rocsparse_droti(rocsparse_handle handle, + rocsparse_int nnz, + double* x_val, + const rocsparse_int* x_ind, + double* y, + const double* c, + const double* s, + rocsparse_index_base idx_base) +{ + return rocsparse_roti_template(handle, nnz, x_val, x_ind, y, c, s, idx_base); +} diff --git a/library/src/level1/rocsparse_roti.hpp 
b/library/src/level1/rocsparse_roti.hpp new file mode 100644 index 00000000..89356b21 --- /dev/null +++ b/library/src/level1/rocsparse_roti.hpp @@ -0,0 +1,29 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_ROTI_HPP +#define ROCSPARSE_ROTI_HPP + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "roti_device.h" + +#include + +template +rocsparse_status rocsparse_roti_template(rocsparse_handle handle, + rocsparse_int nnz, + T* x_val, + const rocsparse_int* x_ind, + T* y, + const T* c, + const T* s, + rocsparse_index_base idx_base) +{ + return rocsparse_status_not_implemented; +} + +#endif // ROCSPARSE_ROTI_HPP diff --git a/library/src/level1/rocsparse_sctr.cpp b/library/src/level1/rocsparse_sctr.cpp new file mode 100644 index 00000000..96026ff8 --- /dev/null +++ b/library/src/level1/rocsparse_sctr.cpp @@ -0,0 +1,32 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "rocsparse.h" +#include "rocsparse_sctr.hpp" + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_ssctr(rocsparse_handle handle, + rocsparse_int nnz, + const float* x_val, + const rocsparse_int* x_ind, + float* y, + rocsparse_index_base idx_base) +{ + return rocsparse_sctr_template(handle, nnz, x_val, x_ind, y, idx_base); +} + +extern "C" rocsparse_status rocsparse_dsctr(rocsparse_handle handle, + rocsparse_int nnz, + const double* x_val, + const rocsparse_int* x_ind, + double* y, + rocsparse_index_base idx_base) +{ + return rocsparse_sctr_template(handle, nnz, x_val, x_ind, y, idx_base); +} diff --git a/library/src/level1/rocsparse_sctr.hpp b/library/src/level1/rocsparse_sctr.hpp new file mode 100644 index 00000000..86348b10 --- /dev/null +++ b/library/src/level1/rocsparse_sctr.hpp @@ -0,0 +1,27 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_SCTR_HPP +#define ROCSPARSE_SCTR_HPP + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "sctr_device.h" + +#include + +template +rocsparse_status rocsparse_sctr_template(rocsparse_handle handle, + rocsparse_int nnz, + const T* x_val, + const rocsparse_int* x_ind, + T* y, + rocsparse_index_base idx_base) +{ + return rocsparse_status_not_implemented; +} + +#endif // ROCSPARSE_SCTR_HPP diff --git a/library/src/level1/roti_device.h b/library/src/level1/roti_device.h new file mode 100644 index 00000000..e69de29b diff --git a/library/src/level1/sctr_device.h b/library/src/level1/sctr_device.h new file mode 100644 index 00000000..e69de29b From 58d6bc5b40823ef048e5c285c4ea4982ad55060b Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sat, 2 Jun 2018 22:40:19 +0200 Subject: [PATCH 111/304] gthr(z) kernels added --- library/src/level1/axpyi_device.h | 6 +++--- library/src/level1/gthr_device.h | 28 ++++++++++++++++++++++++++++ library/src/level1/gthrz_device.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/library/src/level1/axpyi_device.h b/library/src/level1/axpyi_device.h index 474505ed..aac0f269 100644 --- a/library/src/level1/axpyi_device.h +++ b/library/src/level1/axpyi_device.h @@ -17,14 +17,14 @@ __device__ void axpyi_device(rocsparse_int nnz, T* y, rocsparse_index_base idx_base) { - int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - if(tid >= nnz) + if(idx >= nnz) { return; } - y[x_ind[tid] - idx_base] += alpha * x_val[tid]; + y[x_ind[idx] - idx_base] += alpha * x_val[idx]; } #endif // AXPYI_DEVICE_H diff --git a/library/src/level1/gthr_device.h b/library/src/level1/gthr_device.h index e69de29b..9202b63c 100644 --- a/library/src/level1/gthr_device.h +++ b/library/src/level1/gthr_device.h @@ -0,0 +1,28 @@ +/* 
************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef GTHR_DEVICE_H +#define GTHR_DEVICE_H + +#include + +template +__global__ void gthr_device(rocsparse_int nnz, + const T* y, + T* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base) +{ + int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if(idx >= nnz) + { + return; + } + + x_val[idx] = y[x_ind[idx] - idx_base]; +} + +#endif // GTHR_DEVICE_H diff --git a/library/src/level1/gthrz_device.h b/library/src/level1/gthrz_device.h index e69de29b..4796da47 100644 --- a/library/src/level1/gthrz_device.h +++ b/library/src/level1/gthrz_device.h @@ -0,0 +1,29 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef GTHRZ_DEVICE_H +#define GTHRZ_DEVICE_H + +#include + +template +__global__ void gthrz_device(rocsparse_int nnz, + const T* y, + T* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base) +{ + int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if(idx >= nnz) + { + return; + } + + x_val[idx] = y[x_ind[idx] - idx_base]; + y[x_ind[idx] - idx_base] = static_cast(0); +} + +#endif // GTHRZ_DEVICE_H From 9db80bdc22f2f6047ef07b2759447fa04818b9fe Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sun, 3 Jun 2018 13:52:45 +0200 Subject: [PATCH 112/304] level1 kernels --- library/src/level1/doti_device.h | 104 ++++++++++++++++++++++++++++++ library/src/level1/gthrz_device.h | 6 +- library/src/level1/roti_device.h | 36 +++++++++++ library/src/level1/sctr_device.h | 28 ++++++++ 4 files changed, 172 insertions(+), 2 deletions(-) diff --git a/library/src/level1/doti_device.h b/library/src/level1/doti_device.h index e69de29b..72bcc42d 100644 
--- a/library/src/level1/doti_device.h +++ b/library/src/level1/doti_device.h @@ -0,0 +1,104 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef DOTI_DEVICE_H +#define DOTI_DEVICE_H + +#include + +template +__device__ void rocsparse_sum_reduce(rocsparse_int tid, T* x) +{ + // clang-format off + __syncthreads(); + if(n > 512) { if(tid < 512 && tid + 512 < n) { x[tid] += x[tid + 512]; } __syncthreads(); } + if(n > 256) { if(tid < 256 && tid + 256 < n) { x[tid] += x[tid + 256]; } __syncthreads(); } + if(n > 128) { if(tid < 128 && tid + 128 < n) { x[tid] += x[tid + 128]; } __syncthreads(); } + if(n > 64) { if(tid < 64 && tid + 64 < n) { x[tid] += x[tid + 64]; } __syncthreads(); } + if(n > 32) { if(tid < 32 && tid + 32 < n) { x[tid] += x[tid + 32]; } __syncthreads(); } + if(n > 16) { if(tid < 16 && tid + 16 < n) { x[tid] += x[tid + 16]; } __syncthreads(); } + if(n > 8) { if(tid < 8 && tid + 8 < n) { x[tid] += x[tid + 8]; } __syncthreads(); } + if(n > 4) { if(tid < 4 && tid + 4 < n) { x[tid] += x[tid + 4]; } __syncthreads(); } + if(n > 2) { if(tid < 2 && tid + 2 < n) { x[tid] += x[tid + 2]; } __syncthreads(); } + if(n > 1) { if(tid < 1 && tid + 1 < n) { x[tid] += x[tid + 1]; } __syncthreads(); } + // clang-format on +} + +template +__global__ void doti_device_part1(rocsparse_int nnz, + const T* x_val, + const rocsparse_int* x_ind, + const T* y, + T* workspace, + rocsparse_index_base idx_base) +{ + rocsparse_int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + rocsparse_int tid = hipThreadIdx_x; + + __shared__ T sdata[NB]; + + if(idx < nnz) + { + sdata[tid] = y[x_ind[idx] - idx_base] * x_val[idx]; + } + else + { + sdata[tid] = static_cast(0); + } + + rocsparse_sum_reduce(tid, sdata); + + if(tid == 0) + { + workspace[hipBlockIdx_x] = sdata[0]; + } +} + +template +__global__ void 
doti_device_part2(rocsparse_int n, + T* workspace, + T* result) +{ + rocsparse_int tid = hipThreadIdx_x; + + __shared__ T sdata[NB]; + + sdata[tid] = static_cast(0); + + for(rocsparse_int i = tid; i < n; i += NB) + { + sdata[tid] += workspace[i]; + } + __syncthreads(); + + if(n < 32) + { + if(tid == 0) + { + for(rocsparse_int i = 1; i < n; ++i) + { + sdata[0] += sdata[i]; + } + } + } + else + { + rocsparse_sum_reduce(tid, sdata); + } + + if(tid == 0) + { + if(flag) + { + *result = sdata[0]; + } + else + { + workspace[0] = sdata[0]; + } + } +} + +#endif // DOTI_DEVICE_H diff --git a/library/src/level1/gthrz_device.h b/library/src/level1/gthrz_device.h index 4796da47..f8f944b1 100644 --- a/library/src/level1/gthrz_device.h +++ b/library/src/level1/gthrz_device.h @@ -22,8 +22,10 @@ __global__ void gthrz_device(rocsparse_int nnz, return; } - x_val[idx] = y[x_ind[idx] - idx_base]; - y[x_ind[idx] - idx_base] = static_cast(0); + rocsparse_int i = x_ind[idx] - idx_base; + + x_val[idx] = y[i]; + y[i] = static_cast(0); } #endif // GTHRZ_DEVICE_H diff --git a/library/src/level1/roti_device.h b/library/src/level1/roti_device.h index e69de29b..c74556cf 100644 --- a/library/src/level1/roti_device.h +++ b/library/src/level1/roti_device.h @@ -0,0 +1,36 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef ROTI_DEVICE_H +#define ROTI_DEVICE_H + +#include + +template +__device__ void roti_device(rocsparse_int nnz, + T* x_val, + const rocsparse_int* x_ind, + T* y, + T c, + T s, + rocsparse_index_base idx_base) +{ + int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if(idx >= nnz) + { + return; + } + + int i = x_ind[idx] - idx_base; // shift the stored index to zero-based, as sctr/gthrz do + + T xr = x_val[idx]; + T yr = y[i]; + + x_val[idx] = c * xr + s * yr; + y[i] = c * yr - s * xr; +} + +#endif // ROTI_DEVICE_H diff --git a/library/src/level1/sctr_device.h b/library/src/level1/sctr_device.h index e69de29b..785cc419 100644 --- a/library/src/level1/sctr_device.h +++ b/library/src/level1/sctr_device.h @@ -0,0 +1,28 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef SCTR_DEVICE_H +#define SCTR_DEVICE_H + +#include + +template +__global__ void sctr_device(rocsparse_int nnz, + const T* x_val, + const rocsparse_int* x_ind, + T* y, + rocsparse_index_base idx_base) +{ + int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if(idx >= nnz) + { + return; + } + + y[x_ind[idx] - idx_base] = x_val[idx]; +} + +#endif // SCTR_DEVICE_H From 626036352b58225e1038d6e1e9e7e6e95c44b402 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sun, 3 Jun 2018 14:24:54 +0200 Subject: [PATCH 113/304] filled level1 host template functions --- library/src/level1/doti_device.h | 4 +- library/src/level1/gthr_device.h | 2 +- library/src/level1/gthrz_device.h | 4 +- library/src/level1/rocsparse_doti.hpp | 122 ++++++++++++++++++++- library/src/level1/rocsparse_gthr.hpp | 67 +++++++++++- library/src/level1/rocsparse_gthrz.cpp | 4 +- library/src/level1/rocsparse_gthrz.hpp | 69 +++++++++++- library/src/level1/rocsparse_roti.hpp | 146 ++++++++++++++++++++++++- 
library/src/level1/rocsparse_sctr.hpp | 67 +++++++++++- library/src/level1/sctr_device.h | 2 +- 10 files changed, 473 insertions(+), 14 deletions(-) diff --git a/library/src/level1/doti_device.h b/library/src/level1/doti_device.h index 72bcc42d..28f25173 100644 --- a/library/src/level1/doti_device.h +++ b/library/src/level1/doti_device.h @@ -27,7 +27,7 @@ __device__ void rocsparse_sum_reduce(rocsparse_int tid, T* x) } template -__global__ void doti_device_part1(rocsparse_int nnz, +__global__ void doti_kernel_part1(rocsparse_int nnz, const T* x_val, const rocsparse_int* x_ind, const T* y, @@ -57,7 +57,7 @@ __global__ void doti_device_part1(rocsparse_int nnz, } template -__global__ void doti_device_part2(rocsparse_int n, +__global__ void doti_kernel_part2(rocsparse_int n, T* workspace, T* result) { diff --git a/library/src/level1/gthr_device.h b/library/src/level1/gthr_device.h index 9202b63c..d0e74cdb 100644 --- a/library/src/level1/gthr_device.h +++ b/library/src/level1/gthr_device.h @@ -9,7 +9,7 @@ #include template -__global__ void gthr_device(rocsparse_int nnz, +__global__ void gthr_kernel(rocsparse_int nnz, const T* y, T* x_val, const rocsparse_int* x_ind, diff --git a/library/src/level1/gthrz_device.h b/library/src/level1/gthrz_device.h index f8f944b1..1ffa444f 100644 --- a/library/src/level1/gthrz_device.h +++ b/library/src/level1/gthrz_device.h @@ -9,8 +9,8 @@ #include template -__global__ void gthrz_device(rocsparse_int nnz, - const T* y, +__global__ void gthrz_kernel(rocsparse_int nnz, + T* y, T* x_val, const rocsparse_int* x_ind, rocsparse_index_base idx_base) diff --git a/library/src/level1/rocsparse_doti.hpp b/library/src/level1/rocsparse_doti.hpp index cb47e939..955f0b8b 100644 --- a/library/src/level1/rocsparse_doti.hpp +++ b/library/src/level1/rocsparse_doti.hpp @@ -7,6 +7,7 @@ #define ROCSPARSE_DOTI_HPP #include "rocsparse.h" +#include "definitions.h" #include "handle.h" #include "utility.h" #include "doti_device.h" @@ -22,7 +23,126 @@ 
rocsparse_status rocsparse_doti_template(rocsparse_handle handle, T* result, rocsparse_index_base idx_base) { - return rocsparse_status_not_implemented; + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging // TODO bench logging + if(handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_Xdoti"), + nnz, + (const void*&)x_val, + (const void*&)x_ind, + (const void*&)y, + *result, + idx_base); + } + else + { + log_trace(handle, + replaceX("rocsparse_Xdoti"), + nnz, + (const void*&)x_val, + (const void*&)x_ind, + (const void*&)y, + (const void*&)result, + idx_base); + } + + // Check index base + if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + + // Check size + if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(x_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(result == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + +#define DOTI_DIM 512 + rocsparse_int nblocks = (nnz - 1) / DOTI_DIM + 1; + + // Allocate workspace + T* workspace = NULL; + RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace, sizeof(T) * nblocks)); + + dim3 doti_blocks(nblocks); + dim3 doti_threads(DOTI_DIM); + + hipLaunchKernelGGL((doti_kernel_part1), + doti_blocks, + doti_threads, + 0, + stream, + nnz, + x_val, + x_ind, + y, + workspace, + idx_base); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + hipLaunchKernelGGL((doti_kernel_part2), + dim3(1), + doti_threads, + 0, + stream, + nblocks, + workspace, + result); + } + 
else + { + if(nblocks > 1) + { + hipLaunchKernelGGL((doti_kernel_part2), + dim3(1), + doti_threads, + 0, + stream, + nblocks, + workspace, + result); + } + RETURN_IF_HIP_ERROR(hipMemcpy(result, workspace, sizeof(T), hipMemcpyDeviceToHost)); + } + + RETURN_IF_HIP_ERROR(hipFree(workspace)); // release the workspace in both pointer modes + return rocsparse_status_success; } #endif // ROCSPARSE_DOTI_HPP diff --git a/library/src/level1/rocsparse_gthr.hpp b/library/src/level1/rocsparse_gthr.hpp index 5b69b454..599f8b6a 100644 --- a/library/src/level1/rocsparse_gthr.hpp +++ b/library/src/level1/rocsparse_gthr.hpp @@ -21,7 +21,72 @@ rocsparse_status rocsparse_gthr_template(rocsparse_handle handle, const rocsparse_int* x_ind, rocsparse_index_base idx_base) { - return rocsparse_status_not_implemented; + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging // TODO bench logging + log_trace(handle, + replaceX("rocsparse_Xgthr"), + nnz, + (const void*&)y, + (const void*&)x_val, + (const void*&)x_ind, + idx_base); + + // Check index base + if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + + // Check size + if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + +#define GTHR_DIM 512 + dim3 gthr_blocks((nnz - 1) / GTHR_DIM + 1); + dim3 gthr_threads(GTHR_DIM); + + hipLaunchKernelGGL((gthr_kernel), + gthr_blocks, + gthr_threads, + 0, + stream, + nnz, + y, + x_val, + x_ind, + idx_base); +#undef GTHR_DIM + return rocsparse_status_success; } #endif // ROCSPARSE_GTHR_HPP diff --git 
a/library/src/level1/rocsparse_gthrz.cpp b/library/src/level1/rocsparse_gthrz.cpp index d376e917..283df86e 100644 --- a/library/src/level1/rocsparse_gthrz.cpp +++ b/library/src/level1/rocsparse_gthrz.cpp @@ -13,7 +13,7 @@ extern "C" rocsparse_status rocsparse_sgthrz(rocsparse_handle handle, rocsparse_int nnz, - const float* y, + float* y, float* x_val, const rocsparse_int* x_ind, rocsparse_index_base idx_base) @@ -23,7 +23,7 @@ extern "C" rocsparse_status rocsparse_sgthrz(rocsparse_handle handle, extern "C" rocsparse_status rocsparse_dgthrz(rocsparse_handle handle, rocsparse_int nnz, - const double* y, + double* y, double* x_val, const rocsparse_int* x_ind, rocsparse_index_base idx_base) diff --git a/library/src/level1/rocsparse_gthrz.hpp b/library/src/level1/rocsparse_gthrz.hpp index 747edd73..32243ef1 100644 --- a/library/src/level1/rocsparse_gthrz.hpp +++ b/library/src/level1/rocsparse_gthrz.hpp @@ -16,12 +16,77 @@ template rocsparse_status rocsparse_gthrz_template(rocsparse_handle handle, rocsparse_int nnz, - const T* y, + T* y, T* x_val, const rocsparse_int* x_ind, rocsparse_index_base idx_base) { - return rocsparse_status_not_implemented; + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging // TODO bench logging + log_trace(handle, + replaceX("rocsparse_Xgthrz"), + nnz, + (const void*&)y, + (const void*&)x_val, + (const void*&)x_ind, + idx_base); + + // Check index base + if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + + // Check size + if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(nnz == 0) + { + return 
rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + +#define GTHRZ_DIM 512 + dim3 gthrz_blocks((nnz - 1) / GTHRZ_DIM + 1); + dim3 gthrz_threads(GTHRZ_DIM); + + hipLaunchKernelGGL((gthrz_kernel), + gthrz_blocks, + gthrz_threads, + 0, + stream, + nnz, + y, + x_val, + x_ind, + idx_base); +#undef GTHRZ_DIM + return rocsparse_status_success; } #endif // ROCSPARSE_GTHRZ_HPP diff --git a/library/src/level1/rocsparse_roti.hpp b/library/src/level1/rocsparse_roti.hpp index 89356b21..0a82122d 100644 --- a/library/src/level1/rocsparse_roti.hpp +++ b/library/src/level1/rocsparse_roti.hpp @@ -13,6 +13,35 @@ #include +template +__global__ void roti_kernel_host_scalar(rocsparse_int nnz, + T* x_val, + const rocsparse_int* x_ind, + T* y, + T c, + T s, + rocsparse_index_base idx_base) +{ + roti_device(nnz, x_val, x_ind, y, c, s, idx_base); +} + +template +__global__ void roti_kernel_device_scalar(rocsparse_int nnz, + T* x_val, + const rocsparse_int* x_ind, + T* y, + const T* c, + const T* s, + rocsparse_index_base idx_base) +{ + if(*c == static_cast(1) && *s == static_cast(0)) + { + return; + } + + roti_device(nnz, x_val, x_ind, y, *c, *s, idx_base); +} + template rocsparse_status rocsparse_roti_template(rocsparse_handle handle, rocsparse_int nnz, @@ -23,7 +52,122 @@ rocsparse_status rocsparse_roti_template(rocsparse_handle handle, const T* s, rocsparse_index_base idx_base) { - return rocsparse_status_not_implemented; + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging // TODO bench logging + if(handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_Xroti"), + nnz, + (const void*&)x_val, + (const void*&)x_ind, + (const void*&)y, + *c, + *s, + idx_base); + } + else + { + log_trace(handle, + replaceX("rocsparse_Xroti"), + nnz, + (const void*&)x_val, + (const void*&)x_ind, + (const void*&)y, + (const void*&)c, + (const void*&)s, + idx_base); + } + 
+ // Check index base + if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + + // Check size + if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(c == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(s == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + +#define ROTI_DIM 512 + dim3 roti_blocks((nnz - 1) / ROTI_DIM + 1); + dim3 roti_threads(ROTI_DIM); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + hipLaunchKernelGGL((roti_kernel_device_scalar), + roti_blocks, + roti_threads, + 0, + stream, + nnz, + x_val, + x_ind, + y, + c, + s, + idx_base); + } + else + { + if(*c == static_cast(1) && *s == static_cast(0)) + { + return rocsparse_status_success; + } + + hipLaunchKernelGGL((roti_kernel_host_scalar), + roti_blocks, + roti_threads, + 0, + stream, + nnz, + x_val, + x_ind, + y, + *c, + *s, + idx_base); + } +#undef ROTI_DIM + return rocsparse_status_success; } #endif // ROCSPARSE_ROTI_HPP diff --git a/library/src/level1/rocsparse_sctr.hpp b/library/src/level1/rocsparse_sctr.hpp index 86348b10..afebb4ab 100644 --- a/library/src/level1/rocsparse_sctr.hpp +++ b/library/src/level1/rocsparse_sctr.hpp @@ -21,7 +21,72 @@ rocsparse_status rocsparse_sctr_template(rocsparse_handle handle, T* y, rocsparse_index_base idx_base) { - return rocsparse_status_not_implemented; + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging // TODO bench logging + log_trace(handle, + 
replaceX("rocsparse_Xsctr"), + nnz, + (const void*&)x_val, + (const void*&)x_ind, + (const void*&)y, + idx_base); + + // Check index base + if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + + // Check size + if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(x_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + +#define SCTR_DIM 512 + dim3 sctr_blocks((nnz - 1) / SCTR_DIM + 1); + dim3 sctr_threads(SCTR_DIM); + + hipLaunchKernelGGL((sctr_kernel), + sctr_blocks, + sctr_threads, + 0, + stream, + nnz, + x_val, + x_ind, + y, + idx_base); +#undef SCTR_DIM + return rocsparse_status_success; } #endif // ROCSPARSE_SCTR_HPP diff --git a/library/src/level1/sctr_device.h b/library/src/level1/sctr_device.h index 785cc419..b4c6ac62 100644 --- a/library/src/level1/sctr_device.h +++ b/library/src/level1/sctr_device.h @@ -9,7 +9,7 @@ #include template -__global__ void sctr_device(rocsparse_int nnz, +__global__ void sctr_kernel(rocsparse_int nnz, const T* x_val, const rocsparse_int* x_ind, T* y, From 34e94e84a3c261273676c105d777eb1602d5cb1c Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sun, 3 Jun 2018 14:51:34 +0200 Subject: [PATCH 114/304] added level1 functions to API --- library/include/rocsparse-functions.h | 332 ++++++++++++++++++++++++++ 1 file changed, 332 insertions(+) diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 7d3a5fd1..5002814e 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -89,6 +89,338 @@ rocsparse_status 
rocsparse_zaxpyi(rocsparse_handle handle, rocsparse_index_base idx_base); */ +/*! \brief SPARSE Level 1 API + + \details + doti computes the dot product of the sparse vector x and the dense vector + y + + result := y^T x + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + nnz number of non-zero entries of x. + @param[in] + x_val array of nnz values. + @param[in] + x_ind array of nnz elements containing the indices of the non-zero + values of x. + @param[in] + y array of values in dense format. + @param[out] + result pointer to the result, can be host or device memory + @param[in] + idx_base rocsparse_index_base_zero or rocsparse_index_base_one. + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_sdoti(rocsparse_handle handle, + rocsparse_int nnz, + const float* x_val, + const rocsparse_int* x_ind, + const float* y, + float* result, + rocsparse_index_base idx_base); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_ddoti(rocsparse_handle handle, + rocsparse_int nnz, + const double* x_val, + const rocsparse_int* x_ind, + const double* y, + double* result, + rocsparse_index_base idx_base); +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_cdoti(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_float_complex* x_val, + const rocsparse_int* x_ind, + const rocsparse_float_complex* y, + rocsparse_float_complex* result, + rocsparse_index_base idx_base); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_zdoti(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_double_complex* x_val, + const rocsparse_int* x_ind, + const rocsparse_double_complex* y, + rocsparse_double_complex* result, + rocsparse_index_base idx_base); +*/ + +/*! \brief SPARSE Level 1 API + + \details + dotci computes the dot product of the sparse conjugate complex vector x + and the dense vector y + + result := x^H y + + @param[in] + handle rocsparse_handle. 
+ handle to the rocsparse library context queue. + @param[in] + nnz number of non-zero entries of x. + @param[in] + x_val array of nnz values. + @param[in] + x_ind array of nnz elements containing the indices of the non-zero + values of x. + @param[in] + y array of values in dense format. + @param[out] + result pointer to the result, can be host or device memory + @param[in] + idx_base rocsparse_index_base_zero or rocsparse_index_base_one. + + ********************************************************************/ +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_cdotci(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_float_complex* x_val, + const rocsparse_int* x_ind, + const rocsparse_float_complex* y, + rocsparse_float_complex* result, + rocsparse_index_base idx_base); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_zdotci(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_double_complex* x_val, + const rocsparse_int* x_ind, + const rocsparse_double_complex* y, + rocsparse_double_complex* result, + rocsparse_index_base idx_base); +*/ + +/*! \brief SPARSE Level 1 API + + \details + gthr gathers the elements that are listed in x_ind from the dense + vector y and stores them in the sparse vector x + + x := y(x_ind) + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + nnz number of non-zero entries of x. + @param[in] + y array of values in dense format. + @param[out] + x_val array of nnz values. + @param[in] + x_ind array of nnz elements containing the indices of the non-zero + values of x. + @param[in] + idx_base rocsparse_index_base_zero or rocsparse_index_base_one. 
+ + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_sgthr(rocsparse_handle handle, + rocsparse_int nnz, + const float* y, + float* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dgthr(rocsparse_handle handle, + rocsparse_int nnz, + const double* y, + double* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base); +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_cgthr(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_float_complex* y, + rocsparse_float_complex* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_zgthr(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_double_complex* y, + rocsparse_double_complex* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base); +*/ + +/*! \brief SPARSE Level 1 API + + \details + gthrz gathers the elements that are listed in x_ind from the dense + vector y and stores them in the sparse vector x. The gathered elements + are replaced by zero in y. + + x := y(x_ind) + y(x_ind) := 0 + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + nnz number of non-zero entries of x. + @param[inout] + y array of values in dense format. + @param[out] + x_val array of nnz values. + @param[in] + x_ind array of nnz elements containing the indices of the non-zero + values of x. + @param[in] + idx_base rocsparse_index_base_zero or rocsparse_index_base_one. 
+ + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_sgthrz(rocsparse_handle handle, + rocsparse_int nnz, + float* y, + float* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dgthrz(rocsparse_handle handle, + rocsparse_int nnz, + double* y, + double* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base); +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_cgthrz(rocsparse_handle handle, + rocsparse_int nnz, + rocsparse_float_complex* y, + rocsparse_float_complex* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_zgthrz(rocsparse_handle handle, + rocsparse_int nnz, + rocsparse_double_complex* y, + rocsparse_double_complex* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base); +*/ + +/*! \brief SPARSE Level 1 API + + \details + roti applies the Givens rotation matrix G to the sparse vector x and + the dense vector y + + G = ( c s) + (-s c) + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + nnz number of non-zero entries of x. + @param[inout] + x_val array of nnz values. + @param[in] + x_ind array of nnz elements containing the indices of the non-zero + values of x. + @param[inout] + y array of values in dense format. + @param[in] + c pointer to the cosine element of G, can be on host or device + @param[in] + s pointer to the sine element of G, can be on host or device + @param[in] + idx_base rocsparse_index_base_zero or rocsparse_index_base_one. 
+ + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_sroti(rocsparse_handle handle, + rocsparse_int nnz, + float* x_val, + const rocsparse_int* x_ind, + float* y, + const float* c, + const float* s, + rocsparse_index_base idx_base); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_droti(rocsparse_handle handle, + rocsparse_int nnz, + double* x_val, + const rocsparse_int* x_ind, + double* y, + const double* c, + const double* s, + rocsparse_index_base idx_base); + +/*! \brief SPARSE Level 1 API + + \details + sctr scatters the elements that are listed in x_ind from the sparse + vector x into the dense vector y. Indices of y that are not listed + in x_ind remain unchanged + + y(x_ind) := x + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + nnz number of non-zero entries of x. + @param[in] + x_val array of nnz values. + @param[in] + x_ind array of nnz elements containing the indices of the non-zero + values of x. + @param[out] + y array of values in dense format. + @param[in] + idx_base rocsparse_index_base_zero or rocsparse_index_base_one. 
+ + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_ssctr(rocsparse_handle handle, + rocsparse_int nnz, + const float* x_val, + const rocsparse_int* x_ind, + float* y, + rocsparse_index_base idx_base); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dsctr(rocsparse_handle handle, + rocsparse_int nnz, + const double* x_val, + const rocsparse_int* x_ind, + double* y, + rocsparse_index_base idx_base); +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csctr(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_float_complex* x_val, + const rocsparse_int* x_ind, + rocsparse_float_complex* y, + rocsparse_index_base idx_base); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_zsctr(rocsparse_handle handle, + rocsparse_int nnz, + const rocsparse_double_complex* x_val, + const rocsparse_int* x_ind, + rocsparse_double_complex* y, + rocsparse_index_base idx_base); +*/ + /* * =========================================================================== * level 2 SPARSE From 49a4816a060dbe9f3ef4d92871568039d2f9fdb0 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sun, 3 Jun 2018 19:38:09 +0200 Subject: [PATCH 115/304] level1 tests: doti, gthr, gthrz --- .../rocsparse_template_specialization.cpp | 116 +++++++++ clients/include/rocsparse.hpp | 43 ++++ clients/include/testing_doti.hpp | 240 ++++++++++++++++++ clients/include/testing_gthr.hpp | 210 +++++++++++++++ clients/include/testing_gthrz.hpp | 216 ++++++++++++++++ clients/tests/CMakeLists.txt | 3 + clients/tests/test_doti.cpp | 61 +++++ clients/tests/test_gthr.cpp | 61 +++++ clients/tests/test_gthrz.cpp | 61 +++++ 9 files changed, 1011 insertions(+) create mode 100644 clients/include/testing_doti.hpp create mode 100644 clients/include/testing_gthr.hpp create mode 100644 clients/include/testing_gthrz.hpp create mode 100644 clients/tests/test_doti.cpp create mode 100644 clients/tests/test_gthr.cpp create mode 100644 clients/tests/test_gthrz.cpp diff --git 
a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 7d253fd0..3147d7b7 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -32,6 +32,122 @@ rocsparse_status rocsparse_axpyi(rocsparse_handle handle, return rocsparse_daxpyi(handle, nnz, alpha, x_val, x_ind, y, idx_base); } +template <> +rocsparse_status rocsparse_doti(rocsparse_handle handle, + rocsparse_int nnz, + const float* x_val, + const rocsparse_int* x_ind, + const float* y, + float* result, + rocsparse_index_base idx_base) +{ + return rocsparse_sdoti(handle, nnz, x_val, x_ind, y, result, idx_base); +} + +template <> +rocsparse_status rocsparse_doti(rocsparse_handle handle, + rocsparse_int nnz, + const double* x_val, + const rocsparse_int* x_ind, + const double* y, + double* result, + rocsparse_index_base idx_base) +{ + return rocsparse_ddoti(handle, nnz, x_val, x_ind, y, result, idx_base); +} + +template <> +rocsparse_status rocsparse_gthr(rocsparse_handle handle, + rocsparse_int nnz, + const float* y, + float* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base) +{ + return rocsparse_sgthr(handle, nnz, y, x_val, x_ind, idx_base); +} + +template <> +rocsparse_status rocsparse_gthr(rocsparse_handle handle, + rocsparse_int nnz, + const double* y, + double* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base) +{ + return rocsparse_dgthr(handle, nnz, y, x_val, x_ind, idx_base); +} + +template <> +rocsparse_status rocsparse_gthrz(rocsparse_handle handle, + rocsparse_int nnz, + float* y, + float* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base) +{ + return rocsparse_sgthrz(handle, nnz, y, x_val, x_ind, idx_base); +} + +template <> +rocsparse_status rocsparse_gthrz(rocsparse_handle handle, + rocsparse_int nnz, + double* y, + double* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base) +{ + return 
rocsparse_dgthrz(handle, nnz, y, x_val, x_ind, idx_base); +} + +template <> +rocsparse_status rocsparse_roti(rocsparse_handle handle, + rocsparse_int nnz, + float* x_val, + const rocsparse_int* x_ind, + float* y, + const float* c, + const float* s, + rocsparse_index_base idx_base) +{ + return rocsparse_sroti(handle, nnz, x_val, x_ind, y, c, s, idx_base); +} + +template <> +rocsparse_status rocsparse_roti(rocsparse_handle handle, + rocsparse_int nnz, + double* x_val, + const rocsparse_int* x_ind, + double* y, + const double* c, + const double* s, + rocsparse_index_base idx_base) +{ + return rocsparse_droti(handle, nnz, x_val, x_ind, y, c, s, idx_base); +} + +template <> +rocsparse_status rocsparse_sctr(rocsparse_handle handle, + rocsparse_int nnz, + const float* x_val, + const rocsparse_int* x_ind, + float* y, + rocsparse_index_base idx_base) +{ + return rocsparse_ssctr(handle, nnz, x_val, x_ind, y, idx_base); +} + +template <> +rocsparse_status rocsparse_sctr(rocsparse_handle handle, + rocsparse_int nnz, + const double* x_val, + const rocsparse_int* x_ind, + double* y, + rocsparse_index_base idx_base) +{ + return rocsparse_dsctr(handle, nnz, x_val, x_ind, y, idx_base); +} + template <> rocsparse_status rocsparse_coomv(rocsparse_handle handle, rocsparse_operation trans, diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index 4a6ced4f..c24279c8 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -19,6 +19,49 @@ rocsparse_status rocsparse_axpyi(rocsparse_handle handle, T* y, rocsparse_index_base idx_base); +template +rocsparse_status rocsparse_doti(rocsparse_handle handle, + rocsparse_int nnz, + const T* x_val, + const rocsparse_int* x_ind, + const T* y, + T* result, + rocsparse_index_base idx_base); + +template +rocsparse_status rocsparse_gthr(rocsparse_handle handle, + rocsparse_int nnz, + const T* y, + T* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base); + +template +rocsparse_status 
rocsparse_gthrz(rocsparse_handle handle, + rocsparse_int nnz, + T* y, + T* x_val, + const rocsparse_int* x_ind, + rocsparse_index_base idx_base); + +template +rocsparse_status rocsparse_roti(rocsparse_handle handle, + rocsparse_int nnz, + T* x_val, + const rocsparse_int* x_ind, + T* y, + const T* c, + const T* s, + rocsparse_index_base idx_base); + +template +rocsparse_status rocsparse_sctr(rocsparse_handle handle, + rocsparse_int nnz, + const T* x_val, + const rocsparse_int* x_ind, + T* y, + rocsparse_index_base idx_base); + template rocsparse_status rocsparse_coomv(rocsparse_handle handle, rocsparse_operation trans, diff --git a/clients/include/testing_doti.hpp b/clients/include/testing_doti.hpp new file mode 100644 index 00000000..8ea29a12 --- /dev/null +++ b/clients/include/testing_doti.hpp @@ -0,0 +1,240 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef TESTING_DOTI_HPP +#define TESTING_DOTI_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_doti_bad_arg(void) +{ + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + + rocsparse_index_base idx_base = rocsparse_index_base_zero; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + T* dx_val = (T*)dx_val_managed.get(); + rocsparse_int* dx_ind = 
(rocsparse_int*)dx_ind_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + T result; + + // testing for (nullptr == dx_val) + { + T* dx_val_null = nullptr; + + status = rocsparse_doti(handle, nnz, dx_val_null, dx_ind, dy, &result, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: x_val is nullptr"); + } + + // testing for (nullptr == dx_ind) + { + rocsparse_int* dx_ind_null = nullptr; + + status = rocsparse_doti(handle, nnz, dx_val, dx_ind_null, dy, &result, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: x_ind is nullptr"); + } + + // testing for (nullptr == dy) + { + T* dy_null = nullptr; + + status = rocsparse_doti(handle, nnz, dx_val, dx_ind, dy_null, &result, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: y is nullptr"); + } + + // testing for (nullptr == result) + { + T* result_null = nullptr; + + status = rocsparse_doti(handle, nnz, dx_val, dx_ind, dy, result_null, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: result is nullptr"); + } + + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_doti(handle_null, nnz, dx_val, dx_ind, dy, &result, idx_base); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_doti(Arguments argus) +{ + rocsparse_int N = argus.N; + rocsparse_int nnz = argus.nnz; + rocsparse_int safe_size = 100; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_status status; + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + // Argument sanity check before allocating invalid memory + if(nnz <= 0) + { + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dx_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto 
dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dx_val = (T*)dx_val_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dx_ind || !dx_val || !dy"); + return rocsparse_status_memory_error; + } + + T result; + + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + status = rocsparse_doti(handle, nnz, dx_val, dx_ind, dy, &result, idx_base); + + if(nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "nnz == 0"); + } + + return rocsparse_status_success; + } + + // Host structures + std::vector hx_ind(nnz); + std::vector hx_val(nnz); + std::vector hy(N); + + T hresult_1; + T hresult_2; + T hresult_gold; + + // Initial Data on CPU + srand(12345ULL); + rocsparse_init_index(hx_ind.data(), nnz, 1, N); + rocsparse_init(hx_val, 1, nnz); + rocsparse_init(hy, 1, N); + + // allocate memory on device + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + auto dresult_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dx_val = (T*)dx_val_managed.get(); + T* dy = (T*)dy_managed.get(); + T* dresult_2 = (T*)dresult_2_managed.get(); + + if(!dx_ind || !dx_val || !dy || !dresult_2) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dx_ind || !dx_val || !dy || !dresult_2"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR( + hipMemcpy(dx_ind, hx_ind.data(), sizeof(rocsparse_int) 
* nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx_val, hx_val.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N, hipMemcpyHostToDevice)); + + if(argus.unit_check) + { + // ROCSPARSE pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_doti(handle, nnz, dx_val, dx_ind, dy, &hresult_1, idx_base)); + + // ROCSPARSE pointer mode device + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR(rocsparse_doti(handle, nnz, dx_val, dx_ind, dy, dresult_2, idx_base)); + + // copy output from device to CPU^ + CHECK_HIP_ERROR(hipMemcpy(&hresult_2, dresult_2, sizeof(T), hipMemcpyDeviceToHost)); + + // CPU + double cpu_time_used = get_time_us(); + + hresult_gold = static_cast(0); + for(rocsparse_int i = 0; i < nnz; ++i) + { + hresult_gold += hy[hx_ind[i] - idx_base] * hx_val[i]; + } + + cpu_time_used = get_time_us() - cpu_time_used; + + // enable unit check, notice unit check is not invasive, but norm check is, + // unit check and norm check can not be interchanged their order + unit_check_general(1, 1, &hresult_gold, &hresult_1); + unit_check_general(1, 1, &hresult_gold, &hresult_2); + } + + if(argus.timing) + { + int number_cold_calls = 2; + int number_hot_calls = argus.iters; + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + for(rocsparse_int iter = 0; iter < number_cold_calls; iter++) + { + rocsparse_doti(handle, nnz, dx_val, dx_ind, dy, &hresult_1, idx_base); + } + + double gpu_time_used = get_time_us(); // in microseconds + + for(rocsparse_int iter = 0; iter < number_hot_calls; iter++) + { + rocsparse_doti(handle, nnz, dx_val, dx_ind, dy, &hresult_1, idx_base); + } + + gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; + double gpu_gflops = (2.0 * nnz) / 1e9 / gpu_time_used * 1e6 * 1; + double 
bandwidth = (sizeof(rocsparse_int) * nnz + sizeof(T) * nnz * 2.0) / gpu_time_used / 1e3; + + printf("nnz\t\tGFlops\tGB/s\tusec\n"); + printf("%9d\t%0.2lf\t%0.2lf\t%0.2lf\n", + nnz, + gpu_gflops, + bandwidth, + gpu_time_used); + } + return rocsparse_status_success; +} + +#endif // TESTING_DOTI_HPP diff --git a/clients/include/testing_gthr.hpp b/clients/include/testing_gthr.hpp new file mode 100644 index 00000000..4f86d4d9 --- /dev/null +++ b/clients/include/testing_gthr.hpp @@ -0,0 +1,210 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef TESTING_GTHR_HPP +#define TESTING_GTHR_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_gthr_bad_arg(void) +{ + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + + rocsparse_index_base idx_base = rocsparse_index_base_zero; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + T* dx_val = (T*)dx_val_managed.get(); + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // testing for(nullptr == dx_ind) + { + rocsparse_int* dx_ind_null = nullptr; + + status = rocsparse_gthr(handle, nnz, dy, dx_val, dx_ind_null, idx_base); + 
verify_rocsparse_status_invalid_pointer(status, "Error: x_ind is nullptr"); + } + // testing for(nullptr == dx_val) + { + T* dx_val_null = nullptr; + + status = rocsparse_gthr(handle, nnz, dy, dx_val_null, dx_ind, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: x_val is nullptr"); + } + // testing for(nullptr == dy) + { + T* dy_null = nullptr; + + status = rocsparse_gthr(handle, nnz, dy_null, dx_val, dx_ind, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: y is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_gthr(handle_null, nnz, dy, dx_val, dx_ind, idx_base); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_gthr(Arguments argus) +{ + rocsparse_int N = argus.N; + rocsparse_int nnz = argus.nnz; + rocsparse_int safe_size = 100; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_status status; + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + // Argument sanity check before allocating invalid memory + if(nnz <= 0) + { + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dx_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dx_val = (T*)dx_val_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dx_ind || !dx_val || !dy"); + return rocsparse_status_memory_error; + } + + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + status = rocsparse_gthr(handle, nnz, dy, dx_val, dx_ind, idx_base); + + if(nnz < 0) + { + 
verify_rocsparse_status_invalid_size(status, "Error: nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "nnz == 0"); + } + + return rocsparse_status_success; + } + + // Host structures + std::vector hx_ind(nnz); + std::vector hx_val(nnz); + std::vector hx_val_gold(nnz); + std::vector hy(N); + + // Initial Data on CPU + srand(12345ULL); + rocsparse_init_index(hx_ind.data(), nnz, 1, N); + rocsparse_init(hy, 1, N); + + // allocate memory on device + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dx_val = (T*)dx_val_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dx_ind || !dx_val || !dy"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR( + hipMemcpy(dx_ind, hx_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N, hipMemcpyHostToDevice)); + + if(argus.unit_check) + { + // ROCSPARSE pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_gthr(handle, nnz, dy, dx_val, dx_ind, idx_base)); + + // copy output from device to CPU + CHECK_HIP_ERROR(hipMemcpy(hx_val.data(), dx_val, sizeof(T) * nnz, hipMemcpyDeviceToHost)); + + // CPU + double cpu_time_used = get_time_us(); + + for(rocsparse_int i = 0; i < nnz; ++i) + { + hx_val_gold[i] = hy[hx_ind[i] - idx_base]; + } + + cpu_time_used = get_time_us() - cpu_time_used; + + // enable unit check, notice unit check is not invasive, but norm check is, + // unit check and norm check can not be interchanged their 
order + unit_check_general(1, nnz, hx_val_gold.data(), hx_val.data()); + } + + if(argus.timing) + { + int number_cold_calls = 2; + int number_hot_calls = argus.iters; + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + for(rocsparse_int iter = 0; iter < number_cold_calls; iter++) + { + rocsparse_gthr(handle, nnz, dy, dx_val, dx_ind, idx_base); + } + + double gpu_time_used = get_time_us(); // in microseconds + + for(rocsparse_int iter = 0; iter < number_hot_calls; iter++) + { + rocsparse_gthr(handle, nnz, dy, dx_val, dx_ind, idx_base); + } + + gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; + double bandwidth = (sizeof(rocsparse_int) * nnz + sizeof(T) * 2.0 * nnz) / gpu_time_used / 1e3; + + printf("nnz\t\tGB/s\tusec\n"); + printf("%9d\t%0.2lf\t%0.2lf\n", + nnz, + bandwidth, + gpu_time_used); + } + return rocsparse_status_success; +} + +#endif // TESTING_GTHR_HPP diff --git a/clients/include/testing_gthrz.hpp b/clients/include/testing_gthrz.hpp new file mode 100644 index 00000000..a6726bf7 --- /dev/null +++ b/clients/include/testing_gthrz.hpp @@ -0,0 +1,216 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef TESTING_GTHRZ_HPP +#define TESTING_GTHRZ_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_gthrz_bad_arg(void) +{ + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + + rocsparse_index_base idx_base = rocsparse_index_base_zero; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + T* dx_val = (T*)dx_val_managed.get(); + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // testing for(nullptr == dx_ind) + { + rocsparse_int* dx_ind_null = nullptr; + + status = rocsparse_gthrz(handle, nnz, dy, dx_val, dx_ind_null, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: x_ind is nullptr"); + } + // testing for(nullptr == dx_val) + { + T* dx_val_null = nullptr; + + status = rocsparse_gthrz(handle, nnz, dy, dx_val_null, dx_ind, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: x_val is nullptr"); + } + // testing for(nullptr == dy) + { + T* dy_null = nullptr; + + status = rocsparse_gthrz(handle, nnz, dy_null, dx_val, dx_ind, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: y is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_gthrz(handle_null, 
nnz, dy, dx_val, dx_ind, idx_base); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_gthrz(Arguments argus) +{ + rocsparse_int N = argus.N; + rocsparse_int nnz = argus.nnz; + rocsparse_int safe_size = 100; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_status status; + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + // Argument sanity check before allocating invalid memory + if(nnz <= 0) + { + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dx_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dx_val = (T*)dx_val_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dx_ind || !dx_val || !dy"); + return rocsparse_status_memory_error; + } + + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + status = rocsparse_gthrz(handle, nnz, dy, dx_val, dx_ind, idx_base); + + if(nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "nnz == 0"); + } + + return rocsparse_status_success; + } + + // Host structures + std::vector hx_ind(nnz); + std::vector hx_val(nnz); + std::vector hx_val_gold(nnz); + std::vector hy(N); + std::vector hy_gold(N); + + // Initial Data on CPU + srand(12345ULL); + rocsparse_init_index(hx_ind.data(), nnz, 1, N); + rocsparse_init(hy, 1, N); + + hy_gold = hy; + + // allocate memory on device + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) 
* nnz), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dx_val = (T*)dx_val_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dx_ind || !dx_val || !dy"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR( + hipMemcpy(dx_ind, hx_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N, hipMemcpyHostToDevice)); + + if(argus.unit_check) + { + // ROCSPARSE pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_gthrz(handle, nnz, dy, dx_val, dx_ind, idx_base)); + + // copy output from device to CPU + CHECK_HIP_ERROR(hipMemcpy(hx_val.data(), dx_val, sizeof(T) * nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy.data(), dy, sizeof(T) * N, hipMemcpyDeviceToHost)); + + // CPU + double cpu_time_used = get_time_us(); + + for(rocsparse_int i = 0; i < nnz; ++i) + { + hx_val_gold[i] = hy_gold[hx_ind[i] - idx_base]; + hy_gold[hx_ind[i] - idx_base] = static_cast(0); + } + + cpu_time_used = get_time_us() - cpu_time_used; + + // enable unit check, notice unit check is not invasive, but norm check is, + // unit check and norm check can not be interchanged their order + unit_check_general(1, nnz, hx_val_gold.data(), hx_val.data()); + unit_check_general(1, N, hy_gold.data(), hy.data()); + } + + if(argus.timing) + { + int number_cold_calls = 2; + int number_hot_calls = argus.iters; + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + for(rocsparse_int iter = 0; iter < number_cold_calls; iter++) + { + rocsparse_gthrz(handle, nnz, dy, dx_val, dx_ind, idx_base); + } + + double gpu_time_used = get_time_us(); // in 
microseconds + + for(rocsparse_int iter = 0; iter < number_hot_calls; iter++) + { + rocsparse_gthrz(handle, nnz, dy, dx_val, dx_ind, idx_base); + } + + gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; + double bandwidth = (sizeof(rocsparse_int) * nnz + sizeof(T) * 2.0 * nnz) / gpu_time_used / 1e3; + + printf("nnz\t\tGB/s\tusec\n"); + printf("%9d\t%0.2lf\t%0.2lf\n", + nnz, + bandwidth, + gpu_time_used); + } + return rocsparse_status_success; +} + +#endif // TESTING_GTHRZ_HPP diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index 711c324d..d9e90885 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -7,6 +7,9 @@ find_package(GTest REQUIRED) set(ROCSPARSE_TEST_SOURCES rocsparse_gtest_main.cpp test_axpyi.cpp + test_doti.cpp + test_gthr.cpp + test_gthrz.cpp test_coomv.cpp test_csrmv.cpp test_ellmv.cpp diff --git a/clients/tests/test_doti.cpp b/clients/tests/test_doti.cpp new file mode 100644 index 00000000..30b0e96f --- /dev/null +++ b/clients/tests/test_doti.cpp @@ -0,0 +1,61 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "testing_doti.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef rocsparse_index_base base; +typedef std::tuple doti_tuple; + +int doti_N_range[] = {12000, 15332, 22031}; +int doti_nnz_range[] = {-1, 0, 5, 10, 500, 1000, 7111, 10000}; + +base doti_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; + +class parameterized_doti : public testing::TestWithParam +{ + protected: + parameterized_doti() {} + virtual ~parameterized_doti() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_doti_arguments(doti_tuple tup) +{ + Arguments arg; + arg.N = std::get<0>(tup); + arg.nnz = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.timing = 0; + return arg; +} + +TEST(doti_bad_arg, doti_float) { testing_doti_bad_arg(); } + +TEST_P(parameterized_doti, doti_float) +{ + Arguments arg = setup_doti_arguments(GetParam()); + + rocsparse_status status = testing_doti(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_doti, doti_double) +{ + Arguments arg = setup_doti_arguments(GetParam()); + + rocsparse_status status = testing_doti(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(doti, + parameterized_doti, + testing::Combine(testing::ValuesIn(doti_N_range), + testing::ValuesIn(doti_nnz_range), + testing::ValuesIn(doti_idx_base_range))); diff --git a/clients/tests/test_gthr.cpp b/clients/tests/test_gthr.cpp new file mode 100644 index 00000000..9f13e89a --- /dev/null +++ b/clients/tests/test_gthr.cpp @@ -0,0 +1,61 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "testing_gthr.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef rocsparse_index_base base; +typedef std::tuple gthr_tuple; + +int gthr_N_range[] = {12000, 15332, 22031}; +int gthr_nnz_range[] = {-1, 0, 5, 10, 500, 1000, 7111, 10000}; + +base gthr_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; + +class parameterized_gthr : public testing::TestWithParam +{ + protected: + parameterized_gthr() {} + virtual ~parameterized_gthr() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_gthr_arguments(gthr_tuple tup) +{ + Arguments arg; + arg.N = std::get<0>(tup); + arg.nnz = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.timing = 0; + return arg; +} + +TEST(gthr_bad_arg, gthr_float) { testing_gthr_bad_arg(); } + +TEST_P(parameterized_gthr, gthr_float) +{ + Arguments arg = setup_gthr_arguments(GetParam()); + + rocsparse_status status = testing_gthr(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_gthr, gthr_double) +{ + Arguments arg = setup_gthr_arguments(GetParam()); + + rocsparse_status status = testing_gthr(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(gthr, + parameterized_gthr, + testing::Combine(testing::ValuesIn(gthr_N_range), + testing::ValuesIn(gthr_nnz_range), + testing::ValuesIn(gthr_idx_base_range))); diff --git a/clients/tests/test_gthrz.cpp b/clients/tests/test_gthrz.cpp new file mode 100644 index 00000000..d69e517a --- /dev/null +++ b/clients/tests/test_gthrz.cpp @@ -0,0 +1,61 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "testing_gthrz.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef rocsparse_index_base base; +typedef std::tuple gthrz_tuple; + +int gthrz_N_range[] = {12000, 15332, 22031}; +int gthrz_nnz_range[] = {-1, 0, 5, 10, 500, 1000, 7111, 10000}; + +base gthrz_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; + +class parameterized_gthrz : public testing::TestWithParam +{ + protected: + parameterized_gthrz() {} + virtual ~parameterized_gthrz() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_gthrz_arguments(gthrz_tuple tup) +{ + Arguments arg; + arg.N = std::get<0>(tup); + arg.nnz = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.timing = 0; + return arg; +} + +TEST(gthrz_bad_arg, gthrz_float) { testing_gthrz_bad_arg(); } + +TEST_P(parameterized_gthrz, gthrz_float) +{ + Arguments arg = setup_gthrz_arguments(GetParam()); + + rocsparse_status status = testing_gthrz(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_gthrz, gthrz_double) +{ + Arguments arg = setup_gthrz_arguments(GetParam()); + + rocsparse_status status = testing_gthrz(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(gthrz, + parameterized_gthrz, + testing::Combine(testing::ValuesIn(gthrz_N_range), + testing::ValuesIn(gthrz_nnz_range), + testing::ValuesIn(gthrz_idx_base_range))); From a6e32481378dd373f47cbd34689df92b552e9ad6 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sun, 3 Jun 2018 22:22:26 +0200 Subject: [PATCH 116/304] level1 tests: roti, sctr --- clients/include/testing_roti.hpp | 270 +++++++++++++++++++++++++++++++ clients/include/testing_sctr.hpp | 214 ++++++++++++++++++++++++ clients/tests/CMakeLists.txt | 2 + clients/tests/test_roti.cpp | 68 ++++++++ clients/tests/test_sctr.cpp | 61 +++++++ library/src/level1/roti_device.h | 4 +- 6 files changed, 617 
insertions(+), 2 deletions(-) create mode 100644 clients/include/testing_roti.hpp create mode 100644 clients/include/testing_sctr.hpp create mode 100644 clients/tests/test_roti.cpp create mode 100644 clients/tests/test_sctr.cpp diff --git a/clients/include/testing_roti.hpp b/clients/include/testing_roti.hpp new file mode 100644 index 00000000..dafdd031 --- /dev/null +++ b/clients/include/testing_roti.hpp @@ -0,0 +1,270 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef TESTING_ROTI_HPP +#define TESTING_ROTI_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_roti_bad_arg(void) +{ + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + T c = 3.7; + T s = 1.2; + + rocsparse_index_base idx_base = rocsparse_index_base_zero; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + T* dx_val = (T*)dx_val_managed.get(); + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // testing for(nullptr == dx_ind) + { + rocsparse_int* dx_ind_null = nullptr; + + status = rocsparse_roti(handle, nnz, dx_val, dx_ind_null, dy, &c, &s, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: x_ind is 
nullptr"); + } + // testing for(nullptr == dx_val) + { + T* dx_val_null = nullptr; + + status = rocsparse_roti(handle, nnz, dx_val_null, dx_ind, dy, &c, &s, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: x_val is nullptr"); + } + // testing for(nullptr == dy) + { + T* dy_null = nullptr; + + status = rocsparse_roti(handle, nnz, dx_val, dx_ind, dy_null, &c, &s, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: y is nullptr"); + } + // testing for(nullptr == c) + { + T* dc_null = nullptr; + + status = rocsparse_roti(handle, nnz, dx_val, dx_ind, dy, dc_null, &s, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: c is nullptr"); + } + // testing for(nullptr == s) + { + T* ds_null = nullptr; + + status = rocsparse_roti(handle, nnz, dx_val, dx_ind, dy, &c, ds_null, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: s is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_roti(handle_null, nnz, dx_val, dx_ind, dy, &c, &s, idx_base); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_roti(Arguments argus) +{ + rocsparse_int N = argus.N; + rocsparse_int nnz = argus.nnz; + T c = argus.alpha; + T s = argus.beta; + rocsparse_int safe_size = 100; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_status status; + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + // Argument sanity check before allocating invalid memory + if(nnz <= 0) + { + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dx_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dx_val = 
(T*)dx_val_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dx_ind || !dx_val || !dy"); + return rocsparse_status_memory_error; + } + + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + status = rocsparse_roti(handle, nnz, dx_val, dx_ind, dy, &c, &s, idx_base); + + if(nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "nnz == 0"); + } + + return rocsparse_status_success; + } + + // Host structures + std::vector hx_ind(nnz); + std::vector hx_val_1(nnz); + std::vector hx_val_2(nnz); + std::vector hx_val_gold(nnz); + std::vector hy_1(N); + std::vector hy_2(N); + std::vector hy_gold(N); + + // Initial Data on CPU + srand(12345ULL); + rocsparse_init_index(hx_ind.data(), nnz, 1, N); + rocsparse_init(hx_val_1, 1, nnz); + rocsparse_init(hy_1, 1, N); + + hx_val_2 = hx_val_1; + hx_val_gold = hx_val_1; + hy_2 = hy_1; + hy_gold = hy_1; + + // allocate memory on device + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dx_val_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dx_val_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + auto dc_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + auto ds_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dx_val_1 = (T*)dx_val_1_managed.get(); + T* dx_val_2 = (T*)dx_val_2_managed.get(); + T* dy_1 = (T*)dy_1_managed.get(); + T* dy_2 = (T*)dy_2_managed.get(); + T* dc = (T*)dc_managed.get(); + T* ds = 
(T*)ds_managed.get(); + + if(!dx_ind || !dx_val_1 || !dx_val_2 || !dy_1 || !dy_2 || !dc || !ds) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dx_ind || !dx_val_1 || !dx_val_2 || !dy_1 || !dy_2 || !dc || !ds"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR( + hipMemcpy(dx_ind, hx_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx_val_1, hx_val_1.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T) * N, hipMemcpyHostToDevice)); + + if(argus.unit_check) + { + CHECK_HIP_ERROR(hipMemcpy(dx_val_2, hx_val_2.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * N, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dc, &c, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(ds, &s, sizeof(T), hipMemcpyHostToDevice)); + + // ROCSPARSE pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_roti(handle, nnz, dx_val_1, dx_ind, dy_1, &c, &s, idx_base)); + + // ROCSPARSE pointer mode device + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR(rocsparse_roti(handle, nnz, dx_val_2, dx_ind, dy_2, dc, ds, idx_base)); + + // copy output from device to CPU + CHECK_HIP_ERROR(hipMemcpy(hx_val_1.data(), dx_val_1, sizeof(T) * nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hx_val_2.data(), dx_val_2, sizeof(T) * nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * N, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * N, hipMemcpyDeviceToHost)); + + // CPU + double cpu_time_used = get_time_us(); + + for(rocsparse_int i = 0; i < nnz; ++i) + { + rocsparse_int idx = hx_ind[i] - idx_base; + + T x = hx_val_gold[i]; + T y = 
hy_gold[idx]; + + hx_val_gold[i] = c * x + s * y; + hy_gold[idx] = c * y - s * x; + } + + cpu_time_used = get_time_us() - cpu_time_used; + + // enable unit check, notice unit check is not invasive, but norm check is, + // unit check and norm check can not be interchanged their order + unit_check_general(1, nnz, hx_val_gold.data(), hx_val_1.data()); + unit_check_general(1, nnz, hx_val_gold.data(), hx_val_2.data()); + unit_check_general(1, N, hy_gold.data(), hy_1.data()); + unit_check_general(1, N, hy_gold.data(), hy_2.data()); + } + + if(argus.timing) + { + int number_cold_calls = 2; + int number_hot_calls = argus.iters; + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + for(rocsparse_int iter = 0; iter < number_cold_calls; iter++) + { + rocsparse_roti(handle, nnz, dx_val_1, dx_ind, dy_1, &c, &s, idx_base); + } + + double gpu_time_used = get_time_us(); // in microseconds + + for(rocsparse_int iter = 0; iter < number_hot_calls; iter++) + { + rocsparse_roti(handle, nnz, dx_val_1, dx_ind, dy_1, &c, &s, idx_base); + } + + gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; + double gflops = nnz * 6.0 / gpu_time_used / 1e3; + double bandwidth = (sizeof(rocsparse_int) * nnz + sizeof(T) * 2.0 * nnz) / gpu_time_used / 1e3; + + printf("nnz\t\tcosine\tsine\tGFlop/s\tGB/s\tusec\n"); + printf("%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", + nnz, + c, + s, + gflops, + bandwidth, + gpu_time_used); + } + return rocsparse_status_success; +} + +#endif // TESTING_ROTI_HPP diff --git a/clients/include/testing_sctr.hpp b/clients/include/testing_sctr.hpp new file mode 100644 index 00000000..3829a693 --- /dev/null +++ b/clients/include/testing_sctr.hpp @@ -0,0 +1,214 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef TESTING_SCTR_HPP +#define TESTING_SCTR_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_sctr_bad_arg(void) +{ + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + + rocsparse_index_base idx_base = rocsparse_index_base_zero; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + T* dx_val = (T*)dx_val_managed.get(); + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // testing for(nullptr == dx_ind) + { + rocsparse_int* dx_ind_null = nullptr; + + status = rocsparse_sctr(handle, nnz, dx_val, dx_ind_null, dy, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: x_ind is nullptr"); + } + // testing for(nullptr == dx_val) + { + T* dx_val_null = nullptr; + + status = rocsparse_sctr(handle, nnz, dx_val_null, dx_ind, dy, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: x_val is nullptr"); + } + // testing for(nullptr == dy) + { + T* dy_null = nullptr; + + status = rocsparse_sctr(handle, nnz, dx_val, dx_ind, dy_null, idx_base); + verify_rocsparse_status_invalid_pointer(status, "Error: y is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_sctr(handle_null, nnz, 
dx_val, dx_ind, dy, idx_base); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_sctr(Arguments argus) +{ + rocsparse_int N = argus.N; + rocsparse_int nnz = argus.nnz; + rocsparse_int safe_size = 100; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_status status; + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + // Argument sanity check before allocating invalid memory + if(nnz <= 0) + { + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dx_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dx_val = (T*)dx_val_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dx_ind || !dx_val || !dy"); + return rocsparse_status_memory_error; + } + + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + status = rocsparse_sctr(handle, nnz, dx_val, dx_ind, dy, idx_base); + + if(nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "nnz == 0"); + } + + return rocsparse_status_success; + } + + // Host structures + std::vector hx_ind(nnz); + std::vector hx_val(nnz); + std::vector hy(N); + std::vector hy_gold(N); + + // Initial Data on CPU + srand(12345ULL); + rocsparse_init_index(hx_ind.data(), nnz, 1, N); + rocsparse_init(hx_val, 1, nnz); + rocsparse_init(hy, 1, N); + + hy_gold = hy; + + // allocate memory on device + auto dx_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * 
nnz), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + + rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); + T* dx_val = (T*)dx_val_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dx_ind || !dx_val || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dx_ind || !dx_val || !dy"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR( + hipMemcpy(dx_ind, hx_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx_val, hx_val.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N, hipMemcpyHostToDevice)); + + if(argus.unit_check) + { + // ROCSPARSE pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_sctr(handle, nnz, dx_val, dx_ind, dy, idx_base)); + + // copy output from device to CPU + CHECK_HIP_ERROR(hipMemcpy(hy.data(), dy, sizeof(T) * N, hipMemcpyDeviceToHost)); + + // CPU + double cpu_time_used = get_time_us(); + + for(rocsparse_int i = 0; i < nnz; ++i) + { + hy_gold[hx_ind[i] - idx_base] = hx_val[i]; + } + + cpu_time_used = get_time_us() - cpu_time_used; + + // enable unit check, notice unit check is not invasive, but norm check is, + // unit check and norm check can not be interchanged their order + unit_check_general(1, N, hy_gold.data(), hy.data()); + } + + if(argus.timing) + { + int number_cold_calls = 2; + int number_hot_calls = argus.iters; + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + for(rocsparse_int iter = 0; iter < number_cold_calls; iter++) + { + rocsparse_sctr(handle, nnz, dx_val, dx_ind, dy, idx_base); + } + + double gpu_time_used = get_time_us(); // in microseconds + + for(rocsparse_int iter = 0; iter < number_hot_calls; iter++) + { + rocsparse_sctr(handle, nnz, dx_val, dx_ind, dy, 
idx_base); + } + + gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; + double bandwidth = (sizeof(rocsparse_int) * nnz + sizeof(T) * 2.0 * nnz) / gpu_time_used / 1e3; + + printf("nnz\t\tGB/s\tusec\n"); + printf("%9d\t%0.2lf\t%0.2lf\n", + nnz, + bandwidth, + gpu_time_used); + } + return rocsparse_status_success; +} + +#endif // TESTING_SCTR_HPP diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index d9e90885..1299954f 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -10,6 +10,8 @@ set(ROCSPARSE_TEST_SOURCES test_doti.cpp test_gthr.cpp test_gthrz.cpp + test_roti.cpp + test_sctr.cpp test_coomv.cpp test_csrmv.cpp test_ellmv.cpp diff --git a/clients/tests/test_roti.cpp b/clients/tests/test_roti.cpp new file mode 100644 index 00000000..87f46176 --- /dev/null +++ b/clients/tests/test_roti.cpp @@ -0,0 +1,68 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "testing_roti.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef rocsparse_index_base base; +typedef std::tuple roti_tuple; + +int roti_N_range[] = {12000, 15332, 22031}; +int roti_nnz_range[] = {-1, 0, 5, 10, 500, 1000, 7111, 10000}; + +double roti_c_range[] = {-2.0, 0.0, 1.0}; +double roti_s_range[] = {-3.0, 0.0, 4.0}; + +base roti_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; + +class parameterized_roti : public testing::TestWithParam +{ + protected: + parameterized_roti() {} + virtual ~parameterized_roti() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_roti_arguments(roti_tuple tup) +{ + Arguments arg; + arg.N = std::get<0>(tup); + arg.nnz = std::get<1>(tup); + arg.alpha = std::get<2>(tup); + arg.beta = std::get<3>(tup); + arg.idx_base = std::get<4>(tup); + arg.timing = 0; + return arg; +} + +TEST(roti_bad_arg, roti_float) { testing_roti_bad_arg(); } + +TEST_P(parameterized_roti, roti_float) +{ + Arguments arg = setup_roti_arguments(GetParam()); + + rocsparse_status status = testing_roti(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_roti, roti_double) +{ + Arguments arg = setup_roti_arguments(GetParam()); + + rocsparse_status status = testing_roti(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(roti, + parameterized_roti, + testing::Combine(testing::ValuesIn(roti_N_range), + testing::ValuesIn(roti_nnz_range), + testing::ValuesIn(roti_c_range), + testing::ValuesIn(roti_s_range), + testing::ValuesIn(roti_idx_base_range))); diff --git a/clients/tests/test_sctr.cpp b/clients/tests/test_sctr.cpp new file mode 100644 index 00000000..fb398736 --- /dev/null +++ b/clients/tests/test_sctr.cpp @@ -0,0 +1,61 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "testing_sctr.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef rocsparse_index_base base; +typedef std::tuple sctr_tuple; + +int sctr_N_range[] = {12000, 15332, 22031}; +int sctr_nnz_range[] = {-1, 0, 5, 10, 500, 1000, 7111, 10000}; + +base sctr_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; + +class parameterized_sctr : public testing::TestWithParam +{ + protected: + parameterized_sctr() {} + virtual ~parameterized_sctr() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_sctr_arguments(sctr_tuple tup) +{ + Arguments arg; + arg.N = std::get<0>(tup); + arg.nnz = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.timing = 0; + return arg; +} + +TEST(sctr_bad_arg, sctr_float) { testing_sctr_bad_arg(); } + +TEST_P(parameterized_sctr, sctr_float) +{ + Arguments arg = setup_sctr_arguments(GetParam()); + + rocsparse_status status = testing_sctr(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_sctr, sctr_double) +{ + Arguments arg = setup_sctr_arguments(GetParam()); + + rocsparse_status status = testing_sctr(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(sctr, + parameterized_sctr, + testing::Combine(testing::ValuesIn(sctr_N_range), + testing::ValuesIn(sctr_nnz_range), + testing::ValuesIn(sctr_idx_base_range))); diff --git a/library/src/level1/roti_device.h b/library/src/level1/roti_device.h index c74556cf..4150bb74 100644 --- a/library/src/level1/roti_device.h +++ b/library/src/level1/roti_device.h @@ -17,14 +17,14 @@ __device__ void roti_device(rocsparse_int nnz, T s, rocsparse_index_base idx_base) { - int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + rocsparse_int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; if(idx >= nnz) { return; } - int i = x_ind[idx - idx_base]; + rocsparse_int i = x_ind[idx] - 
idx_base; T xr = x_val[idx]; T yr = y[i]; From 13e43e4fdc31a845a1fbf64db77e42402bf8c632 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sun, 3 Jun 2018 22:27:21 +0200 Subject: [PATCH 117/304] clang-format --- clients/include/testing_csr2ell.hpp | 173 +++++++++++++++++++++---- clients/include/testing_doti.hpp | 22 ++-- clients/include/testing_gthr.hpp | 23 ++-- clients/include/testing_gthrz.hpp | 25 ++-- clients/include/testing_roti.hpp | 62 +++++---- clients/include/testing_sctr.hpp | 23 ++-- library/src/level1/doti_device.h | 4 +- library/src/level1/gthrz_device.h | 9 +- library/src/level1/rocsparse_dotci.cpp | 6 +- library/src/level1/rocsparse_doti.cpp | 6 +- library/src/level1/rocsparse_gthr.hpp | 12 +- library/src/level1/rocsparse_gthrz.hpp | 12 +- library/src/level1/rocsparse_roti.hpp | 12 +- library/src/level1/rocsparse_sctr.hpp | 12 +- 14 files changed, 247 insertions(+), 154 deletions(-) diff --git a/clients/include/testing_csr2ell.hpp b/clients/include/testing_csr2ell.hpp index 3abb1695..26657072 100644 --- a/clients/include/testing_csr2ell.hpp +++ b/clients/include/testing_csr2ell.hpp @@ -62,7 +62,8 @@ void testing_csr2ell_bad_arg(void) { rocsparse_int* csr_row_ptr_null = nullptr; - status = rocsparse_csr2ell_width(handle, m, csr_descr, csr_row_ptr_null, ell_descr, &ell_width); + status = + rocsparse_csr2ell_width(handle, m, csr_descr, csr_row_ptr_null, ell_descr, &ell_width); verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); } @@ -70,7 +71,8 @@ void testing_csr2ell_bad_arg(void) { rocsparse_int* ell_width_null = nullptr; - status = rocsparse_csr2ell_width(handle, m, csr_descr, csr_row_ptr, ell_descr, ell_width_null); + status = + rocsparse_csr2ell_width(handle, m, csr_descr, csr_row_ptr, ell_descr, ell_width_null); verify_rocsparse_status_invalid_pointer(status, "Error: ell_width is nullptr"); } @@ -78,7 +80,8 @@ void testing_csr2ell_bad_arg(void) { rocsparse_mat_descr csr_descr_null = nullptr; - status = 
rocsparse_csr2ell_width(handle, m, csr_descr_null, csr_row_ptr, ell_descr, &ell_width); + status = + rocsparse_csr2ell_width(handle, m, csr_descr_null, csr_row_ptr, ell_descr, &ell_width); verify_rocsparse_status_invalid_pointer(status, "Error: csr_descr is nullptr"); } @@ -86,7 +89,8 @@ void testing_csr2ell_bad_arg(void) { rocsparse_mat_descr ell_descr_null = nullptr; - status = rocsparse_csr2ell_width(handle, m, csr_descr, csr_row_ptr, ell_descr_null, &ell_width); + status = + rocsparse_csr2ell_width(handle, m, csr_descr, csr_row_ptr, ell_descr_null, &ell_width); verify_rocsparse_status_invalid_pointer(status, "Error: ell_descr is nullptr"); } @@ -94,7 +98,8 @@ void testing_csr2ell_bad_arg(void) { rocsparse_handle handle_null = nullptr; - status = rocsparse_csr2ell_width(handle_null, m, csr_descr, csr_row_ptr, ell_descr, &ell_width); + status = + rocsparse_csr2ell_width(handle_null, m, csr_descr, csr_row_ptr, ell_descr, &ell_width); verify_rocsparse_status_invalid_handle(status); } @@ -121,7 +126,16 @@ void testing_csr2ell_bad_arg(void) { rocsparse_int* csr_row_ptr_null = nullptr; - status = rocsparse_csr2ell(handle, m, csr_descr, csr_val, csr_row_ptr_null, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); + status = rocsparse_csr2ell(handle, + m, + csr_descr, + csr_val, + csr_row_ptr_null, + csr_col_ind, + ell_descr, + ell_width, + ell_val, + ell_col_ind); verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); } @@ -129,7 +143,16 @@ void testing_csr2ell_bad_arg(void) { rocsparse_int* csr_col_ind_null = nullptr; - status = rocsparse_csr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind_null, ell_descr, ell_width, ell_val, ell_col_ind); + status = rocsparse_csr2ell(handle, + m, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind_null, + ell_descr, + ell_width, + ell_val, + ell_col_ind); verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr"); } @@ -137,7 +160,16 @@ void 
testing_csr2ell_bad_arg(void) { T* csr_val_null = nullptr; - status = rocsparse_csr2ell(handle, m, csr_descr, csr_val_null, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); + status = rocsparse_csr2ell(handle, + m, + csr_descr, + csr_val_null, + csr_row_ptr, + csr_col_ind, + ell_descr, + ell_width, + ell_val, + ell_col_ind); verify_rocsparse_status_invalid_pointer(status, "Error: csr_val is nullptr"); } @@ -145,7 +177,16 @@ void testing_csr2ell_bad_arg(void) { rocsparse_int* ell_col_ind_null = nullptr; - status = rocsparse_csr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind_null); + status = rocsparse_csr2ell(handle, + m, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind, + ell_descr, + ell_width, + ell_val, + ell_col_ind_null); verify_rocsparse_status_invalid_pointer(status, "Error: ell_col_ind is nullptr"); } @@ -153,7 +194,16 @@ void testing_csr2ell_bad_arg(void) { T* ell_val_null = nullptr; - status = rocsparse_csr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val_null, ell_col_ind); + status = rocsparse_csr2ell(handle, + m, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind, + ell_descr, + ell_width, + ell_val_null, + ell_col_ind); verify_rocsparse_status_invalid_pointer(status, "Error: ell_val is nullptr"); } @@ -161,7 +211,16 @@ void testing_csr2ell_bad_arg(void) { rocsparse_mat_descr csr_descr_null = nullptr; - status = rocsparse_csr2ell(handle, m, csr_descr_null, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); + status = rocsparse_csr2ell(handle, + m, + csr_descr_null, + csr_val, + csr_row_ptr, + csr_col_ind, + ell_descr, + ell_width, + ell_val, + ell_col_ind); verify_rocsparse_status_invalid_pointer(status, "Error: csr_descr is nullptr"); } @@ -169,7 +228,16 @@ void testing_csr2ell_bad_arg(void) { rocsparse_mat_descr ell_descr_null = nullptr; - status = rocsparse_csr2ell(handle, m, csr_descr, csr_val, 
csr_row_ptr, csr_col_ind, ell_descr_null, ell_width, ell_val, ell_col_ind); + status = rocsparse_csr2ell(handle, + m, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind, + ell_descr_null, + ell_width, + ell_val, + ell_col_ind); verify_rocsparse_status_invalid_pointer(status, "Error: ell_descr is nullptr"); } @@ -177,7 +245,16 @@ void testing_csr2ell_bad_arg(void) { rocsparse_handle handle_null = nullptr; - status = rocsparse_csr2ell(handle_null, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, ell_width, ell_val, ell_col_ind); + status = rocsparse_csr2ell(handle_null, + m, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind, + ell_descr, + ell_width, + ell_val, + ell_col_ind); verify_rocsparse_status_invalid_handle(status); } } @@ -217,7 +294,11 @@ rocsparse_status testing_csr2ell(Arguments argus) // Argument sanity check before allocating invalid memory if(m <= 0 || n <= 0 || nnz <= 0) { - auto csr_row_ptr_managed = (m > 0) ? rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free} : rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_row_ptr_managed = + (m > 0) + ? 
rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free} + : rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), + device_free}; auto csr_col_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto csr_val_managed = @@ -266,7 +347,16 @@ rocsparse_status testing_csr2ell(Arguments argus) } // Step 2 - status = rocsparse_csr2ell(handle, m, csr_descr, csr_val, csr_row_ptr, csr_col_ind, ell_descr, 0, ell_val, ell_col_ind); + status = rocsparse_csr2ell(handle, + m, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind, + ell_descr, + 0, + ell_val, + ell_col_ind); if(m < 0) { @@ -377,13 +467,13 @@ rocsparse_status testing_csr2ell(Arguments argus) break; } - rocsparse_int idx = ELL_IND(i, p++, m, ell_width_gold); + rocsparse_int idx = ELL_IND(i, p++, m, ell_width_gold); hell_col_ind_gold[idx] = hcsr_col_ind[j] - csr_base + ell_base; hell_val_gold[idx] = hcsr_val[j]; } for(rocsparse_int j = hcsr_row_ptr[i + 1] - hcsr_row_ptr[i]; j < ell_width_gold; ++j) { - rocsparse_int idx = ELL_IND(i, p++, m, ell_width_gold); + rocsparse_int idx = ELL_IND(i, p++, m, ell_width_gold); hell_col_ind_gold[idx] = -1; hell_val_gold[idx] = static_cast(0); } @@ -396,7 +486,8 @@ rocsparse_status testing_csr2ell(Arguments argus) if(argus.unit_check) { - CHECK_ROCSPARSE_ERROR(rocsparse_csr2ell_width(handle, m, csr_descr, dcsr_row_ptr, ell_descr, &ell_width)); + CHECK_ROCSPARSE_ERROR( + rocsparse_csr2ell_width(handle, m, csr_descr, dcsr_row_ptr, ell_descr, &ell_width)); rocsparse_int ell_nnz = ell_width * m; @@ -407,20 +498,30 @@ rocsparse_status testing_csr2ell(Arguments argus) // Allocate ELL device memory auto dell_col_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * ell_nnz), device_free}; - auto dell_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * ell_nnz), device_free}; + auto dell_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * ell_nnz), device_free}; 
rocsparse_int* dell_col_ind = (rocsparse_int*)dell_col_ind_managed.get(); T* dell_val = (T*)dell_val_managed.get(); // Perform actual ELL conversion - CHECK_ROCSPARSE_ERROR(rocsparse_csr2ell(handle, m, csr_descr, dcsr_val, dcsr_row_ptr, dcsr_col_ind, ell_descr, ell_width, dell_val, dell_col_ind)); + CHECK_ROCSPARSE_ERROR(rocsparse_csr2ell(handle, + m, + csr_descr, + dcsr_val, + dcsr_row_ptr, + dcsr_col_ind, + ell_descr, + ell_width, + dell_val, + dell_col_ind)); CHECK_HIP_ERROR(hipMemcpy(hell_col_ind.data(), dell_col_ind, sizeof(rocsparse_int) * ell_nnz, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy( - hell_val.data(), dell_val, sizeof(T) * ell_nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(hell_val.data(), dell_val, sizeof(T) * ell_nnz, hipMemcpyDeviceToHost)); // Unit check unit_check_general(1, ell_nnz, hell_col_ind_gold.data(), hell_col_ind.data()); @@ -439,12 +540,22 @@ rocsparse_status testing_csr2ell(Arguments argus) auto dell_col_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * ell_nnz), device_free}; - auto dell_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * ell_nnz), device_free}; + auto dell_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * ell_nnz), device_free}; rocsparse_int* dell_col_ind = (rocsparse_int*)dell_col_ind_managed.get(); T* dell_val = (T*)dell_val_managed.get(); - rocsparse_csr2ell(handle, m, csr_descr, dcsr_val, dcsr_row_ptr, dcsr_col_ind, ell_descr, ell_width, dell_val, dell_col_ind); + rocsparse_csr2ell(handle, + m, + csr_descr, + dcsr_val, + dcsr_row_ptr, + dcsr_col_ind, + ell_descr, + ell_width, + dell_val, + dell_col_ind); } double gpu_time_used = get_time_us(); @@ -456,12 +567,22 @@ rocsparse_status testing_csr2ell(Arguments argus) auto dell_col_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * ell_nnz), device_free}; - auto dell_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * ell_nnz), device_free}; + auto dell_val_managed = 
+ rocsparse_unique_ptr{device_malloc(sizeof(T) * ell_nnz), device_free}; rocsparse_int* dell_col_ind = (rocsparse_int*)dell_col_ind_managed.get(); T* dell_val = (T*)dell_val_managed.get(); - rocsparse_csr2ell(handle, m, csr_descr, dcsr_val, dcsr_row_ptr, dcsr_col_ind, ell_descr, ell_width, dell_val, dell_col_ind); + rocsparse_csr2ell(handle, + m, + csr_descr, + dcsr_val, + dcsr_row_ptr, + dcsr_col_ind, + ell_descr, + ell_width, + dell_val, + dell_col_ind); } gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); diff --git a/clients/include/testing_doti.hpp b/clients/include/testing_doti.hpp index 8ea29a12..16e44aa4 100644 --- a/clients/include/testing_doti.hpp +++ b/clients/include/testing_doti.hpp @@ -35,7 +35,7 @@ void testing_doti_bad_arg(void) T* dx_val = (T*)dx_val_managed.get(); rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) { @@ -109,7 +109,7 @@ rocsparse_status testing_doti(Arguments argus) rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); T* dx_val = (T*)dx_val_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) { @@ -153,13 +153,13 @@ rocsparse_status testing_doti(Arguments argus) // allocate memory on device auto dx_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; - auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; - auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; auto dresult_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); T* dx_val = (T*)dx_val_managed.get(); - T* dy = 
(T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); T* dresult_2 = (T*)dresult_2_managed.get(); if(!dx_ind || !dx_val || !dy || !dresult_2) @@ -179,7 +179,8 @@ rocsparse_status testing_doti(Arguments argus) { // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - CHECK_ROCSPARSE_ERROR(rocsparse_doti(handle, nnz, dx_val, dx_ind, dy, &hresult_1, idx_base)); + CHECK_ROCSPARSE_ERROR( + rocsparse_doti(handle, nnz, dx_val, dx_ind, dy, &hresult_1, idx_base)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); @@ -225,14 +226,11 @@ rocsparse_status testing_doti(Arguments argus) gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; double gpu_gflops = (2.0 * nnz) / 1e9 / gpu_time_used * 1e6 * 1; - double bandwidth = (sizeof(rocsparse_int) * nnz + sizeof(T) * nnz * 2.0) / gpu_time_used / 1e3; + double bandwidth = + (sizeof(rocsparse_int) * nnz + sizeof(T) * nnz * 2.0) / gpu_time_used / 1e3; printf("nnz\t\tGFlops\tGB/s\tusec\n"); - printf("%9d\t%0.2lf\t%0.2lf\t%0.2lf\n", - nnz, - gpu_gflops, - bandwidth, - gpu_time_used); + printf("%9d\t%0.2lf\t%0.2lf\t%0.2lf\n", nnz, gpu_gflops, bandwidth, gpu_time_used); } return rocsparse_status_success; } diff --git a/clients/include/testing_gthr.hpp b/clients/include/testing_gthr.hpp index 4f86d4d9..9cefd4a8 100644 --- a/clients/include/testing_gthr.hpp +++ b/clients/include/testing_gthr.hpp @@ -35,7 +35,7 @@ void testing_gthr_bad_arg(void) T* dx_val = (T*)dx_val_managed.get(); rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) { @@ -96,7 +96,7 @@ rocsparse_status testing_gthr(Arguments argus) rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); T* dx_val = (T*)dx_val_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) 
{ @@ -134,17 +134,16 @@ rocsparse_status testing_gthr(Arguments argus) // allocate memory on device auto dx_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; - auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; - auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); T* dx_val = (T*)dx_val_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) { - verify_rocsparse_status_success(rocsparse_status_memory_error, - "!dx_ind || !dx_val || !dy"); + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dx_ind || !dx_val || !dy"); return rocsparse_status_memory_error; } @@ -195,14 +194,12 @@ rocsparse_status testing_gthr(Arguments argus) rocsparse_gthr(handle, nnz, dy, dx_val, dx_ind, idx_base); } - gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; - double bandwidth = (sizeof(rocsparse_int) * nnz + sizeof(T) * 2.0 * nnz) / gpu_time_used / 1e3; + gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; + double bandwidth = + (sizeof(rocsparse_int) * nnz + sizeof(T) * 2.0 * nnz) / gpu_time_used / 1e3; printf("nnz\t\tGB/s\tusec\n"); - printf("%9d\t%0.2lf\t%0.2lf\n", - nnz, - bandwidth, - gpu_time_used); + printf("%9d\t%0.2lf\t%0.2lf\n", nnz, bandwidth, gpu_time_used); } return rocsparse_status_success; } diff --git a/clients/include/testing_gthrz.hpp b/clients/include/testing_gthrz.hpp index a6726bf7..072e1fa3 100644 --- a/clients/include/testing_gthrz.hpp +++ b/clients/include/testing_gthrz.hpp @@ -35,7 +35,7 @@ void testing_gthrz_bad_arg(void) T* dx_val = (T*)dx_val_managed.get(); rocsparse_int* dx_ind = 
(rocsparse_int*)dx_ind_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) { @@ -96,7 +96,7 @@ rocsparse_status testing_gthrz(Arguments argus) rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); T* dx_val = (T*)dx_val_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) { @@ -137,17 +137,16 @@ rocsparse_status testing_gthrz(Arguments argus) // allocate memory on device auto dx_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; - auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; - auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); T* dx_val = (T*)dx_val_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) { - verify_rocsparse_status_success(rocsparse_status_memory_error, - "!dx_ind || !dx_val || !dy"); + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dx_ind || !dx_val || !dy"); return rocsparse_status_memory_error; } @@ -171,7 +170,7 @@ rocsparse_status testing_gthrz(Arguments argus) for(rocsparse_int i = 0; i < nnz; ++i) { - hx_val_gold[i] = hy_gold[hx_ind[i] - idx_base]; + hx_val_gold[i] = hy_gold[hx_ind[i] - idx_base]; hy_gold[hx_ind[i] - idx_base] = static_cast(0); } @@ -201,14 +200,12 @@ rocsparse_status testing_gthrz(Arguments argus) rocsparse_gthrz(handle, nnz, dy, dx_val, dx_ind, idx_base); } - gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; - double bandwidth = (sizeof(rocsparse_int) * nnz + sizeof(T) * 2.0 * nnz) / gpu_time_used / 1e3; + gpu_time_used = (get_time_us() - gpu_time_used) / 
number_hot_calls; + double bandwidth = + (sizeof(rocsparse_int) * nnz + sizeof(T) * 2.0 * nnz) / gpu_time_used / 1e3; printf("nnz\t\tGB/s\tusec\n"); - printf("%9d\t%0.2lf\t%0.2lf\n", - nnz, - bandwidth, - gpu_time_used); + printf("%9d\t%0.2lf\t%0.2lf\n", nnz, bandwidth, gpu_time_used); } return rocsparse_status_success; } diff --git a/clients/include/testing_roti.hpp b/clients/include/testing_roti.hpp index dafdd031..b2e6ebe2 100644 --- a/clients/include/testing_roti.hpp +++ b/clients/include/testing_roti.hpp @@ -21,8 +21,8 @@ void testing_roti_bad_arg(void) { rocsparse_int nnz = 100; rocsparse_int safe_size = 100; - T c = 3.7; - T s = 1.2; + T c = 3.7; + T s = 1.2; rocsparse_index_base idx_base = rocsparse_index_base_zero; rocsparse_status status; @@ -37,7 +37,7 @@ void testing_roti_bad_arg(void) T* dx_val = (T*)dx_val_managed.get(); rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) { @@ -94,8 +94,8 @@ rocsparse_status testing_roti(Arguments argus) { rocsparse_int N = argus.N; rocsparse_int nnz = argus.nnz; - T c = argus.alpha; - T s = argus.beta; + T c = argus.alpha; + T s = argus.beta; rocsparse_int safe_size = 100; rocsparse_index_base idx_base = argus.idx_base; rocsparse_status status; @@ -114,7 +114,7 @@ rocsparse_status testing_roti(Arguments argus) rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); T* dx_val = (T*)dx_val_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) { @@ -153,32 +153,34 @@ rocsparse_status testing_roti(Arguments argus) rocsparse_init(hx_val_1, 1, nnz); rocsparse_init(hy_1, 1, N); - hx_val_2 = hx_val_1; + hx_val_2 = hx_val_1; hx_val_gold = hx_val_1; - hy_2 = hy_1; - hy_gold = hy_1; + hy_2 = hy_1; + hy_gold = hy_1; // allocate memory on device auto dx_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; - auto dx_val_1_managed 
= rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; - auto dx_val_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; - auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; - auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; - auto dc_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; - auto ds_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + auto dx_val_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dx_val_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + auto dc_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + auto ds_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); T* dx_val_1 = (T*)dx_val_1_managed.get(); T* dx_val_2 = (T*)dx_val_2_managed.get(); - T* dy_1 = (T*)dy_1_managed.get(); - T* dy_2 = (T*)dy_2_managed.get(); - T* dc = (T*)dc_managed.get(); - T* ds = (T*)ds_managed.get(); + T* dy_1 = (T*)dy_1_managed.get(); + T* dy_2 = (T*)dy_2_managed.get(); + T* dc = (T*)dc_managed.get(); + T* ds = (T*)ds_managed.get(); if(!dx_ind || !dx_val_1 || !dx_val_2 || !dy_1 || !dy_2 || !dc || !ds) { - verify_rocsparse_status_success(rocsparse_status_memory_error, "!dx_ind || !dx_val_1 || !dx_val_2 || !dy_1 || !dy_2 || !dc || !ds"); + verify_rocsparse_status_success( + rocsparse_status_memory_error, + "!dx_ind || !dx_val_1 || !dx_val_2 || !dy_1 || !dy_2 || !dc || !ds"); return rocsparse_status_memory_error; } @@ -190,22 +192,27 @@ rocsparse_status testing_roti(Arguments argus) if(argus.unit_check) { - CHECK_HIP_ERROR(hipMemcpy(dx_val_2, hx_val_2.data(), sizeof(T) * nnz, 
hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dx_val_2, hx_val_2.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * N, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dc, &c, sizeof(T), hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(ds, &s, sizeof(T), hipMemcpyHostToDevice)); // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - CHECK_ROCSPARSE_ERROR(rocsparse_roti(handle, nnz, dx_val_1, dx_ind, dy_1, &c, &s, idx_base)); + CHECK_ROCSPARSE_ERROR( + rocsparse_roti(handle, nnz, dx_val_1, dx_ind, dy_1, &c, &s, idx_base)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); - CHECK_ROCSPARSE_ERROR(rocsparse_roti(handle, nnz, dx_val_2, dx_ind, dy_2, dc, ds, idx_base)); + CHECK_ROCSPARSE_ERROR( + rocsparse_roti(handle, nnz, dx_val_2, dx_ind, dy_2, dc, ds, idx_base)); // copy output from device to CPU - CHECK_HIP_ERROR(hipMemcpy(hx_val_1.data(), dx_val_1, sizeof(T) * nnz, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hx_val_2.data(), dx_val_2, sizeof(T) * nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(hx_val_1.data(), dx_val_1, sizeof(T) * nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(hx_val_2.data(), dx_val_2, sizeof(T) * nnz, hipMemcpyDeviceToHost)); CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * N, hipMemcpyDeviceToHost)); CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * N, hipMemcpyDeviceToHost)); @@ -251,9 +258,10 @@ rocsparse_status testing_roti(Arguments argus) rocsparse_roti(handle, nnz, dx_val_1, dx_ind, dy_1, &c, &s, idx_base); } - gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; + gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; double gflops = nnz * 6.0 / gpu_time_used / 1e3; - double bandwidth = (sizeof(rocsparse_int) * nnz + sizeof(T) * 2.0 * nnz) / 
gpu_time_used / 1e3; + double bandwidth = + (sizeof(rocsparse_int) * nnz + sizeof(T) * 2.0 * nnz) / gpu_time_used / 1e3; printf("nnz\t\tcosine\tsine\tGFlop/s\tGB/s\tusec\n"); printf("%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", diff --git a/clients/include/testing_sctr.hpp b/clients/include/testing_sctr.hpp index 3829a693..b7a781e9 100644 --- a/clients/include/testing_sctr.hpp +++ b/clients/include/testing_sctr.hpp @@ -35,7 +35,7 @@ void testing_sctr_bad_arg(void) T* dx_val = (T*)dx_val_managed.get(); rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) { @@ -96,7 +96,7 @@ rocsparse_status testing_sctr(Arguments argus) rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); T* dx_val = (T*)dx_val_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) { @@ -137,17 +137,16 @@ rocsparse_status testing_sctr(Arguments argus) // allocate memory on device auto dx_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; - auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; - auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; + auto dx_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * N), device_free}; rocsparse_int* dx_ind = (rocsparse_int*)dx_ind_managed.get(); T* dx_val = (T*)dx_val_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dy = (T*)dy_managed.get(); if(!dx_ind || !dx_val || !dy) { - verify_rocsparse_status_success(rocsparse_status_memory_error, - "!dx_ind || !dx_val || !dy"); + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dx_ind || !dx_val || !dy"); return rocsparse_status_memory_error; } @@ -199,14 +198,12 @@ rocsparse_status testing_sctr(Arguments argus) rocsparse_sctr(handle, 
nnz, dx_val, dx_ind, dy, idx_base); } - gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; - double bandwidth = (sizeof(rocsparse_int) * nnz + sizeof(T) * 2.0 * nnz) / gpu_time_used / 1e3; + gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; + double bandwidth = + (sizeof(rocsparse_int) * nnz + sizeof(T) * 2.0 * nnz) / gpu_time_used / 1e3; printf("nnz\t\tGB/s\tusec\n"); - printf("%9d\t%0.2lf\t%0.2lf\n", - nnz, - bandwidth, - gpu_time_used); + printf("%9d\t%0.2lf\t%0.2lf\n", nnz, bandwidth, gpu_time_used); } return rocsparse_status_success; } diff --git a/library/src/level1/doti_device.h b/library/src/level1/doti_device.h index 28f25173..625d22b4 100644 --- a/library/src/level1/doti_device.h +++ b/library/src/level1/doti_device.h @@ -57,9 +57,7 @@ __global__ void doti_kernel_part1(rocsparse_int nnz, } template -__global__ void doti_kernel_part2(rocsparse_int n, - T* workspace, - T* result) +__global__ void doti_kernel_part2(rocsparse_int n, T* workspace, T* result) { rocsparse_int tid = hipThreadIdx_x; diff --git a/library/src/level1/gthrz_device.h b/library/src/level1/gthrz_device.h index 1ffa444f..3457dae0 100644 --- a/library/src/level1/gthrz_device.h +++ b/library/src/level1/gthrz_device.h @@ -9,11 +9,8 @@ #include template -__global__ void gthrz_kernel(rocsparse_int nnz, - T* y, - T* x_val, - const rocsparse_int* x_ind, - rocsparse_index_base idx_base) +__global__ void gthrz_kernel( + rocsparse_int nnz, T* y, T* x_val, const rocsparse_int* x_ind, rocsparse_index_base idx_base) { int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -25,7 +22,7 @@ __global__ void gthrz_kernel(rocsparse_int nnz, rocsparse_int i = x_ind[idx] - idx_base; x_val[idx] = y[i]; - y[i] = static_cast(0); + y[i] = static_cast(0); } #endif // GTHRZ_DEVICE_H diff --git a/library/src/level1/rocsparse_dotci.cpp b/library/src/level1/rocsparse_dotci.cpp index c5226404..f99b3c30 100644 --- a/library/src/level1/rocsparse_dotci.cpp +++ 
b/library/src/level1/rocsparse_dotci.cpp @@ -19,7 +19,8 @@ extern "C" rocsparse_status rocsparse_cdotci(rocsparse_handle handle, rocsparse_float_complex* result, rocsparse_index_base idx_base) { - return rocsparse_dotci_template(handle, nnz, x_val, x_ind, y, result, idx_base); + return rocsparse_dotci_template(handle, nnz, x_val, x_ind, y, result, +idx_base); } extern "C" rocsparse_status rocsparse_zdotci(rocsparse_handle handle, @@ -30,6 +31,7 @@ extern "C" rocsparse_status rocsparse_zdotci(rocsparse_handle handle, rocsparse_double_complex* result, rocsparse_index_base idx_base) { - return rocsparse_dotci_template(handle, nnz, x_val, x_ind, y, result, idx_base); + return rocsparse_dotci_template(handle, nnz, x_val, x_ind, y, result, +idx_base); } */ diff --git a/library/src/level1/rocsparse_doti.cpp b/library/src/level1/rocsparse_doti.cpp index 5b2e4703..d09b5401 100644 --- a/library/src/level1/rocsparse_doti.cpp +++ b/library/src/level1/rocsparse_doti.cpp @@ -41,7 +41,8 @@ extern "C" rocsparse_status rocsparse_sdoti(rocsparse_handle handle, rocsparse_float_complex* result, rocsparse_index_base idx_base) { - return rocsparse_doti_template(handle, nnz, x_val, x_ind, y, result, idx_base); + return rocsparse_doti_template(handle, nnz, x_val, x_ind, y, result, +idx_base); } extern "C" rocsparse_status rocsparse_ddoti(rocsparse_handle handle, @@ -52,6 +53,7 @@ extern "C" rocsparse_status rocsparse_ddoti(rocsparse_handle handle, rocsparse_double_complex* result, rocsparse_index_base idx_base) { - return rocsparse_doti_template(handle, nnz, x_val, x_ind, y, result, idx_base); + return rocsparse_doti_template(handle, nnz, x_val, x_ind, y, result, +idx_base); } */ diff --git a/library/src/level1/rocsparse_gthr.hpp b/library/src/level1/rocsparse_gthr.hpp index 599f8b6a..e57dc99e 100644 --- a/library/src/level1/rocsparse_gthr.hpp +++ b/library/src/level1/rocsparse_gthr.hpp @@ -75,16 +75,8 @@ rocsparse_status rocsparse_gthr_template(rocsparse_handle handle, dim3 
gthr_blocks((nnz - 1) / GTHR_DIM + 1); dim3 gthr_threads(GTHR_DIM); - hipLaunchKernelGGL((gthr_kernel), - gthr_blocks, - gthr_threads, - 0, - stream, - nnz, - y, - x_val, - x_ind, - idx_base); + hipLaunchKernelGGL( + (gthr_kernel), gthr_blocks, gthr_threads, 0, stream, nnz, y, x_val, x_ind, idx_base); #undef GTHR_DIM return rocsparse_status_success; } diff --git a/library/src/level1/rocsparse_gthrz.hpp b/library/src/level1/rocsparse_gthrz.hpp index 32243ef1..b8bb3652 100644 --- a/library/src/level1/rocsparse_gthrz.hpp +++ b/library/src/level1/rocsparse_gthrz.hpp @@ -75,16 +75,8 @@ rocsparse_status rocsparse_gthrz_template(rocsparse_handle handle, dim3 gthrz_blocks((nnz - 1) / GTHRZ_DIM + 1); dim3 gthrz_threads(GTHRZ_DIM); - hipLaunchKernelGGL((gthrz_kernel), - gthrz_blocks, - gthrz_threads, - 0, - stream, - nnz, - y, - x_val, - x_ind, - idx_base); + hipLaunchKernelGGL( + (gthrz_kernel), gthrz_blocks, gthrz_threads, 0, stream, nnz, y, x_val, x_ind, idx_base); #undef GTHRZ_DIM return rocsparse_status_success; } diff --git a/library/src/level1/rocsparse_roti.hpp b/library/src/level1/rocsparse_roti.hpp index 0a82122d..815318a1 100644 --- a/library/src/level1/rocsparse_roti.hpp +++ b/library/src/level1/rocsparse_roti.hpp @@ -27,12 +27,12 @@ __global__ void roti_kernel_host_scalar(rocsparse_int nnz, template __global__ void roti_kernel_device_scalar(rocsparse_int nnz, - T* x_val, - const rocsparse_int* x_ind, - T* y, - const T* c, - const T* s, - rocsparse_index_base idx_base) + T* x_val, + const rocsparse_int* x_ind, + T* y, + const T* c, + const T* s, + rocsparse_index_base idx_base) { if(*c == static_cast(1) && *s == static_cast(0)) { diff --git a/library/src/level1/rocsparse_sctr.hpp b/library/src/level1/rocsparse_sctr.hpp index afebb4ab..9ae50a5e 100644 --- a/library/src/level1/rocsparse_sctr.hpp +++ b/library/src/level1/rocsparse_sctr.hpp @@ -75,16 +75,8 @@ rocsparse_status rocsparse_sctr_template(rocsparse_handle handle, dim3 sctr_blocks((nnz - 1) / SCTR_DIM + 1); 
dim3 sctr_threads(SCTR_DIM); - hipLaunchKernelGGL((sctr_kernel), - sctr_blocks, - sctr_threads, - 0, - stream, - nnz, - x_val, - x_ind, - y, - idx_base); + hipLaunchKernelGGL( + (sctr_kernel), sctr_blocks, sctr_threads, 0, stream, nnz, x_val, x_ind, y, idx_base); #undef SCTR_DIM return rocsparse_status_success; } From c6d389ea3191ae58e8b7208b22190e08e2a2887a Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 4 Jun 2018 07:57:07 +0200 Subject: [PATCH 118/304] conversions: added create_identity_permutation --- library/include/rocsparse-functions.h | 20 +++++++ library/src/CMakeLists.txt | 1 + library/src/conversion/identity_device.h | 24 ++++++++ library/src/conversion/rocsparse_identity.cpp | 60 +++++++++++++++++++ 4 files changed, 105 insertions(+) create mode 100644 library/src/conversion/identity_device.h create mode 100644 library/src/conversion/rocsparse_identity.cpp diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 5002814e..7a970d1c 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -1071,6 +1071,26 @@ rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, rocsparse_int* csr_row_ptr, rocsparse_index_base idx_base); +/*! \brief SPARSE Format Conversions API + + \details + create_identity_permutation stores the identity map in array p + + p = 0:1:(n-1) + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. 
+ @param[in] + n size of the map p + @param[out] + p array of n integers containing the map + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status +rocsparse_create_identity_permutation(rocsparse_handle handle, rocsparse_int n, rocsparse_int* p); + #ifdef __cplusplus } #endif diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index d79dec8b..c12f74a0 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -28,4 +28,5 @@ set(rocsparse_source src/conversion/rocsparse_csr2ell.cpp src/conversion/rocsparse_csr2hyb.cpp src/conversion/rocsparse_coo2csr.cpp + src/conversion/rocsparse_identity.cpp ) diff --git a/library/src/conversion/identity_device.h b/library/src/conversion/identity_device.h new file mode 100644 index 00000000..5bf31520 --- /dev/null +++ b/library/src/conversion/identity_device.h @@ -0,0 +1,24 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef IDENTITY_DEVICE_H +#define IDENTITY_DEVICE_H + +#include + +// Create identity permutation +__global__ void identity_kernel(rocsparse_int n, rocsparse_int* p) +{ + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if(gid >= n) + { + return; + } + + p[gid] = gid; +} + +#endif // IDENTITY_DEVICE_H diff --git a/library/src/conversion/rocsparse_identity.cpp b/library/src/conversion/rocsparse_identity.cpp new file mode 100644 index 00000000..3c0311fa --- /dev/null +++ b/library/src/conversion/rocsparse_identity.cpp @@ -0,0 +1,60 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "identity_device.h" + +#include + +extern "C" rocsparse_status rocsparse_create_identity_permutation(rocsparse_handle handle, rocsparse_int n, rocsparse_int* p) +{ + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging TODO bench logging + log_trace(handle, + "rocsparse_create_identity_permutation", + n, + (const void*&)p); + + // Check sizes + if(n < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(p == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(n == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + +#define IDENTITY_DIM 512 + dim3 identity_blocks((n - 1) / IDENTITY_DIM + 1); + dim3 identity_threads(IDENTITY_DIM); + + hipLaunchKernelGGL((identity_kernel), + identity_blocks, + identity_threads, + 0, + stream, + n, + p); +#undef IDENTITY_DIM + return rocsparse_status_success; +} From 5401255c4d5e592571a5d7d4e9961038265bd76a Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 4 Jun 2018 07:57:39 +0200 Subject: [PATCH 119/304] tests: create_identity_permutation --- clients/include/testing_identity.hpp | 153 +++++++++++++++++++++++++++ clients/tests/CMakeLists.txt | 1 + clients/tests/test_identity.cpp | 41 +++++++ 3 files changed, 195 insertions(+) create mode 100644 clients/include/testing_identity.hpp create mode 100644 clients/tests/test_identity.cpp diff --git a/clients/include/testing_identity.hpp b/clients/include/testing_identity.hpp new file mode 100644 index 00000000..c0d969bc --- /dev/null +++ b/clients/include/testing_identity.hpp @@ -0,0 +1,153 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef TESTING_IDENTITY_HPP +#define TESTING_IDENTITY_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +void testing_identity_bad_arg(void) +{ + rocsparse_int n = 100; + rocsparse_int safe_size = 100; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + auto p_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + + rocsparse_int* p = (rocsparse_int*)p_managed.get(); + + if(!p) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // Testing for (p == nullptr) + { + rocsparse_int* p_null = nullptr; + + status = rocsparse_create_identity_permutation(handle, n, p_null); + verify_rocsparse_status_invalid_pointer(status, "Error: p is nullptr"); + } + + // Testing for(handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_create_identity_permutation(handle_null, n, p); + verify_rocsparse_status_invalid_handle(status); + } +} + +rocsparse_status testing_identity(Arguments argus) +{ + rocsparse_int n = argus.N; + rocsparse_int safe_size = 100; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + // Argument sanity check before allocating invalid memory + if(n <= 0) + { + auto p_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + + rocsparse_int* p = (rocsparse_int*)p_managed.get(); + + if(!p) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, "!p"); + return rocsparse_status_memory_error; + } + + status = rocsparse_create_identity_permutation(handle, n, p); + + if(n < 0) + { + 
verify_rocsparse_status_invalid_size(status, "Error: n < 0"); + } + else + { + verify_rocsparse_status_success(status, "n >= 0"); + } + + return rocsparse_status_success; + } + + // Host structures + std::vector hp(n); + std::vector hp_gold(n); + + // create_identity_permutation on host + for(rocsparse_int i = 0; i < n; ++i) + { + hp_gold[i] = i; + } + + // Allocate memory on the device + auto dp_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * n), device_free}; + + rocsparse_int* dp = (rocsparse_int*)dp_managed.get(); + + if(!dp) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, "!p"); + return rocsparse_status_memory_error; + } + + if(argus.unit_check) + { + CHECK_ROCSPARSE_ERROR(rocsparse_create_identity_permutation(handle, n, dp)); + + // Copy output from device to host + CHECK_HIP_ERROR(hipMemcpy(hp.data(), dp, sizeof(rocsparse_int) * n, hipMemcpyDeviceToHost)); + + // Unit check + unit_check_general(1, n, hp_gold.data(), hp.data()); + } + + if(argus.timing) + { + rocsparse_int number_cold_calls = 2; + rocsparse_int number_hot_calls = argus.iters; + + for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) + { + rocsparse_create_identity_permutation(handle, n, dp); + } + + double gpu_time_used = get_time_us(); + + for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) + { + rocsparse_create_identity_permutation(handle, n, dp); + } + + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + + double bandwidth = sizeof(rocsparse_int) * n / gpu_time_used / 1e6; + + printf("n\t\tGB/s\tmsec\n"); + printf("%8d\t%0.2lf\t%0.2lf\n", n, bandwidth, gpu_time_used); + } + return rocsparse_status_success; +} + +#endif // TESTING_IDENTITY_HPP diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index 1299954f..83dd4f7b 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -20,6 +20,7 @@ set(ROCSPARSE_TEST_SOURCES test_csr2ell.cpp test_csr2hyb.cpp 
test_coo2csr.cpp + test_identity.cpp ) set(ROCSPARSE_CLIENTS_COMMON diff --git a/clients/tests/test_identity.cpp b/clients/tests/test_identity.cpp new file mode 100644 index 00000000..3f5542de --- /dev/null +++ b/clients/tests/test_identity.cpp @@ -0,0 +1,41 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "testing_identity.hpp" +#include "utility.hpp" + +#include +#include +#include + +int identity_N_range[] = {-3, 0, 33, 242, 623, 1000}; + +class parameterized_identity : public testing::TestWithParam +{ + protected: + parameterized_identity() {} + virtual ~parameterized_identity() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_identity_arguments(int n) +{ + Arguments arg; + arg.N = n; + arg.timing = 0; + return arg; +} + +TEST(identity_bad_arg, identity) { testing_identity_bad_arg(); } + +TEST_P(parameterized_identity, identity) +{ + Arguments arg = setup_identity_arguments(GetParam()); + + rocsparse_status status = testing_identity(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(identity, parameterized_identity, testing::ValuesIn(identity_N_range)); From 712e60390af70d81804712317788e4b14a284448 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 4 Jun 2018 08:13:56 +0200 Subject: [PATCH 120/304] level1: benchmarks --- clients/benchmarks/client.cpp | 56 +++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index a59bed4e..62c89ef8 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -4,15 +4,27 @@ #include "utility.hpp" #include "rocsparse.hpp" + +// Level1 +#include "testing_axpyi.hpp" +#include "testing_doti.hpp" +#include "testing_gthr.hpp" +#include "testing_gthrz.hpp" +#include 
"testing_roti.hpp" +#include "testing_sctr.hpp" + +// Level2 #include "testing_coomv.hpp" #include "testing_csrmv.hpp" #include "testing_ellmv.hpp" #include "testing_hybmv.hpp" -#include "testing_axpyi.hpp" + +// Conversion #include "testing_csr2coo.hpp" #include "testing_csr2ell.hpp" #include "testing_csr2hyb.hpp" #include "testing_coo2csr.hpp" +#include "testing_identity.hpp" #include #include @@ -67,8 +79,10 @@ int main(int argc, char* argv[]) ("function,f", po::value(&function)->default_value("axpyi"), - "SPARSE function to test. Options: axpyi, coomv, csrmv, ellmv, hybmv, csr2coo, " - "csr2ell, csr2hyb, coo2csr") + "SPARSE function to test. Options:\n" + " Level1: axpyi, doti, gthr, gthrz, roti, sctr\n" + " Level2: coomv, csrmv, ellmv, hybmv\n" + " Conversion: csr2coo, csr2ell, csr2hyb, coo2csr") ("precision,r", po::value(&precision)->default_value('s'), "Options: s,d") @@ -123,6 +137,7 @@ int main(int argc, char* argv[]) return -1; } + // Level1 if(function == "axpyi") { if(precision == 's') @@ -130,6 +145,41 @@ int main(int argc, char* argv[]) else if(precision == 'd') testing_axpyi(argus); } + else if(function == "doti") + { + if(precision == 's') + testing_doti(argus); + else if(precision == 'd') + testing_doti(argus); + } + else if(function == "gthr") + { + if(precision == 's') + testing_gthr(argus); + else if(precision == 'd') + testing_gthr(argus); + } + else if(function == "gthrz") + { + if(precision == 's') + testing_gthrz(argus); + else if(precision == 'd') + testing_gthrz(argus); + } + else if(function == "roti") + { + if(precision == 's') + testing_roti(argus); + else if(precision == 'd') + testing_roti(argus); + } + else if(function == "sctr") + { + if(precision == 's') + testing_sctr(argus); + else if(precision == 'd') + testing_sctr(argus); + } else if(function == "coomv") { if(precision == 's') From d7e2e6daf1010a0d02024b07553b846f33544a11 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 4 Jun 2018 08:56:23 +0200 Subject: [PATCH 121/304] 
int -> rocsparse_int --- library/src/conversion/csr2ell_device.h | 2 +- library/src/conversion/csr2hyb_device.h | 2 +- library/src/level1/axpyi_device.h | 2 +- library/src/level1/gthr_device.h | 2 +- library/src/level1/gthrz_device.h | 2 +- library/src/level1/sctr_device.h | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/library/src/conversion/csr2ell_device.h b/library/src/conversion/csr2ell_device.h index b373d3e5..98219238 100644 --- a/library/src/conversion/csr2ell_device.h +++ b/library/src/conversion/csr2ell_device.h @@ -16,7 +16,7 @@ __device__ void ell_width_reduce(rocsparse_int tid, rocsparse_int* data) { __syncthreads(); - for(int i = NB >> 1; i > 0; i >>= 1) + for(rocsparse_int i = NB >> 1; i > 0; i >>= 1) { if(tid < i) { diff --git a/library/src/conversion/csr2hyb_device.h b/library/src/conversion/csr2hyb_device.h index 0b9ab1cf..494199bb 100644 --- a/library/src/conversion/csr2hyb_device.h +++ b/library/src/conversion/csr2hyb_device.h @@ -16,7 +16,7 @@ __device__ void sum_reduce(rocsparse_int tid, rocsparse_int* data) { __syncthreads(); - for(int i = NB >> 1; i > 0; i >>= 1) + for(rocsparse_int i = NB >> 1; i > 0; i >>= 1) { if(tid < i) { diff --git a/library/src/level1/axpyi_device.h b/library/src/level1/axpyi_device.h index aac0f269..28c5b2e9 100644 --- a/library/src/level1/axpyi_device.h +++ b/library/src/level1/axpyi_device.h @@ -17,7 +17,7 @@ __device__ void axpyi_device(rocsparse_int nnz, T* y, rocsparse_index_base idx_base) { - int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + rocsparse_int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; if(idx >= nnz) { diff --git a/library/src/level1/gthr_device.h b/library/src/level1/gthr_device.h index d0e74cdb..0a52226c 100644 --- a/library/src/level1/gthr_device.h +++ b/library/src/level1/gthr_device.h @@ -15,7 +15,7 @@ __global__ void gthr_kernel(rocsparse_int nnz, const rocsparse_int* x_ind, rocsparse_index_base idx_base) { - int idx = hipBlockIdx_x * hipBlockDim_x + 
hipThreadIdx_x; + rocsparse_int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; if(idx >= nnz) { diff --git a/library/src/level1/gthrz_device.h b/library/src/level1/gthrz_device.h index 3457dae0..458941e8 100644 --- a/library/src/level1/gthrz_device.h +++ b/library/src/level1/gthrz_device.h @@ -12,7 +12,7 @@ template __global__ void gthrz_kernel( rocsparse_int nnz, T* y, T* x_val, const rocsparse_int* x_ind, rocsparse_index_base idx_base) { - int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + rocsparse_int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; if(idx >= nnz) { diff --git a/library/src/level1/sctr_device.h b/library/src/level1/sctr_device.h index b4c6ac62..1fcc4304 100644 --- a/library/src/level1/sctr_device.h +++ b/library/src/level1/sctr_device.h @@ -15,7 +15,7 @@ __global__ void sctr_kernel(rocsparse_int nnz, T* y, rocsparse_index_base idx_base) { - int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + rocsparse_int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; if(idx >= nnz) { From 12579bb2f2afa869e85e75c7975312731e2c736b Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 5 Jun 2018 08:23:02 +0200 Subject: [PATCH 122/304] added clang format check in CI --- Jenkinsfile | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6009f66a..69b51165 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -241,19 +241,19 @@ def docker_build_inside_image( def build_image, compiler_data compiler_args, doc """ archiveArtifacts artifacts: "${docker_context}/*.deb", fingerprint: true -// stage('Clang Format') -// { -// sh ''' -// find . -iname \'*.h\' \ -// -o -iname \'*.hpp\' \ -// -o -iname \'*.cpp\' \ -// -o -iname \'*.h.in\' \ -// -o -iname \'*.hpp.in\' \ -// -o -iname \'*.cpp.in\' \ -// | grep -v 'build/' \ -// | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-3.8 -style=file {} | diff - {}\' -// ''' -// } + stage('Clang Format') + { + sh ''' + find . 
-iname \'*.h\' \ + -o -iname \'*.hpp\' \ + -o -iname \'*.cpp\' \ + -o -iname \'*.h.in\' \ + -o -iname \'*.hpp.in\' \ + -o -iname \'*.cpp.in\' \ + | grep -v 'build/' \ + | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-3.8 -style=file {} | diff - {}\' + ''' + } } else if( paths.project_name.equalsIgnoreCase( 'rocsparse-fedora' ) ) { From 05d5f0164dfb5e8bc3af6c7658ff24daa6e5a7e9 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 5 Jun 2018 08:47:00 +0200 Subject: [PATCH 123/304] clang format fix --- library/src/conversion/identity_device.h | 2 +- library/src/conversion/rocsparse_identity.cpp | 16 ++++------------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/library/src/conversion/identity_device.h b/library/src/conversion/identity_device.h index 5bf31520..f59930eb 100644 --- a/library/src/conversion/identity_device.h +++ b/library/src/conversion/identity_device.h @@ -11,7 +11,7 @@ // Create identity permutation __global__ void identity_kernel(rocsparse_int n, rocsparse_int* p) { - rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; if(gid >= n) { diff --git a/library/src/conversion/rocsparse_identity.cpp b/library/src/conversion/rocsparse_identity.cpp index 3c0311fa..fab8bd6b 100644 --- a/library/src/conversion/rocsparse_identity.cpp +++ b/library/src/conversion/rocsparse_identity.cpp @@ -9,7 +9,8 @@ #include -extern "C" rocsparse_status rocsparse_create_identity_permutation(rocsparse_handle handle, rocsparse_int n, rocsparse_int* p) +extern "C" rocsparse_status +rocsparse_create_identity_permutation(rocsparse_handle handle, rocsparse_int n, rocsparse_int* p) { // Check for valid handle if(handle == nullptr) @@ -18,10 +19,7 @@ extern "C" rocsparse_status rocsparse_create_identity_permutation(rocsparse_hand } // Logging TODO bench logging - log_trace(handle, - "rocsparse_create_identity_permutation", - n, - (const void*&)p); + log_trace(handle, 
"rocsparse_create_identity_permutation", n, (const void*&)p); // Check sizes if(n < 0) @@ -48,13 +46,7 @@ extern "C" rocsparse_status rocsparse_create_identity_permutation(rocsparse_hand dim3 identity_blocks((n - 1) / IDENTITY_DIM + 1); dim3 identity_threads(IDENTITY_DIM); - hipLaunchKernelGGL((identity_kernel), - identity_blocks, - identity_threads, - 0, - stream, - n, - p); + hipLaunchKernelGGL((identity_kernel), identity_blocks, identity_threads, 0, stream, n, p); #undef IDENTITY_DIM return rocsparse_status_success; } From c499328cb9b0f534db8b36f302aac5c609ca5a1f Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 5 Jun 2018 09:42:14 +0200 Subject: [PATCH 124/304] sym links were missing packages --- library/CMakeLists.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index ea9e6847..8776bfd1 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -73,6 +73,9 @@ rocm_export_targets(TARGETS rocsparse-targets NAMESPACE roc:: ) +# Symbolic links +rocm_install_symlink_subdir(rocsparse) + # Package specific CPACK vars set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc (>= 1.3)") set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc >= 1.3") @@ -84,8 +87,14 @@ endif() set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" "\${CPACK_PACKAGING_INSTALL_PREFIX}/include") -# Package name -set(package_name rocsparse) +# Package name differs for CUDA backend +if(HIP_PLATFORM STREQUAL "hcc") + set(package_name rocsparse) +endif() + +if(HIP_PLATFORM STREQUAL "nvcc") + set(package_name rocsparse-alt) +endif() set(ROCSPARSE_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" CACHE PATH "Path placed into ldconfig file") From 0a35e8379f3e2380841cc47054f66bf06a6d2735 Mon Sep 17 00:00:00 2001 From: Nico <31079890+ntrost57@users.noreply.github.com> Date: Wed, 6 Jun 2018 12:04:54 +0200 Subject: [PATCH 125/304] removed clang format check for some reason, clang 
format check fails on some CI machines --- Jenkinsfile | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 69b51165..6009f66a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -241,19 +241,19 @@ def docker_build_inside_image( def build_image, compiler_data compiler_args, doc """ archiveArtifacts artifacts: "${docker_context}/*.deb", fingerprint: true - stage('Clang Format') - { - sh ''' - find . -iname \'*.h\' \ - -o -iname \'*.hpp\' \ - -o -iname \'*.cpp\' \ - -o -iname \'*.h.in\' \ - -o -iname \'*.hpp.in\' \ - -o -iname \'*.cpp.in\' \ - | grep -v 'build/' \ - | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-3.8 -style=file {} | diff - {}\' - ''' - } +// stage('Clang Format') +// { +// sh ''' +// find . -iname \'*.h\' \ +// -o -iname \'*.hpp\' \ +// -o -iname \'*.cpp\' \ +// -o -iname \'*.h.in\' \ +// -o -iname \'*.hpp.in\' \ +// -o -iname \'*.cpp.in\' \ +// | grep -v 'build/' \ +// | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-3.8 -style=file {} | diff - {}\' +// ''' +// } } else if( paths.project_name.equalsIgnoreCase( 'rocsparse-fedora' ) ) { From 3b9931f1be1f6504215c58a8c2a680a3b2220372 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 15 Jun 2018 17:23:42 +0200 Subject: [PATCH 126/304] csrmm prep --- .../rocsparse_template_specialization.cpp | 42 ++ clients/include/rocsparse.hpp | 18 + clients/include/testing_csrmm.hpp | 391 ++++++++++++++++++ clients/tests/CMakeLists.txt | 1 + clients/tests/test_csrmm.cpp | 68 +++ library/include/rocsparse-functions.h | 119 ++++++ library/src/CMakeLists.txt | 3 + library/src/level3/csrmm_device.h | 12 + library/src/level3/rocsparse_csrmm.cpp | 54 +++ library/src/level3/rocsparse_csrmm.hpp | 49 +++ 10 files changed, 757 insertions(+) create mode 100644 clients/include/testing_csrmm.hpp create mode 100644 clients/tests/test_csrmm.cpp create mode 100644 library/src/level3/csrmm_device.h create mode 100644 library/src/level3/rocsparse_csrmm.cpp create mode 
100644 library/src/level3/rocsparse_csrmm.hpp diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 3147d7b7..6b552165 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -286,6 +286,48 @@ rocsparse_status rocsparse_hybmv(rocsparse_handle handle, return rocsparse_dhybmv(handle, trans, alpha, descr, hyb, x, beta, y); } +template <> +rocsparse_status rocsparse_csrmm(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const float* alpha, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const float* B, + rocsparse_int ldb, + const float* beta, + float* C, + rocsparse_int ldc) +{ + return rocsparse_scsrmm(handle, trans, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); +} + +template <> +rocsparse_status rocsparse_csrmm(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const double* alpha, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const double* B, + rocsparse_int ldb, + const double* beta, + double* C, + rocsparse_int ldc) +{ + return rocsparse_dcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); +} + template <> rocsparse_status rocsparse_csr2ell(rocsparse_handle handle, rocsparse_int m, diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index c24279c8..7b3485b2 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -115,6 +115,24 @@ rocsparse_status rocsparse_hybmv(rocsparse_handle handle, const T* beta, T* y); +template +rocsparse_status 
rocsparse_csrmm(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const T* alpha, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* B, + rocsparse_int ldb, + const T* beta, + T* C, + rocsparse_int ldc); + template rocsparse_status rocsparse_csr2ell(rocsparse_handle handle, rocsparse_int m, diff --git a/clients/include/testing_csrmm.hpp b/clients/include/testing_csrmm.hpp new file mode 100644 index 00000000..c5c8a85e --- /dev/null +++ b/clients/include/testing_csrmm.hpp @@ -0,0 +1,391 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef TESTING_CSRMM_HPP +#define TESTING_CSRMM_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_csrmm_bad_arg(void) +{ + + rocsparse_int n = 100; + rocsparse_int m = 100; + rocsparse_int k = 100; + rocsparse_int ldb = 100; + rocsparse_int ldc = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + T alpha = 0.6; + T beta = 0.2; + rocsparse_operation trans = rocsparse_operation_none; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_descr(new descr_struct); + rocsparse_mat_descr descr = unique_ptr_descr->descr; + + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = 
rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dB_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dC_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dB = (T*)dB_managed.get(); + T* dC = (T*)dC_managed.get(); + + if(!dval || !dptr || !dcol || !dB || !dC) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // testing for(nullptr == dptr) + { + rocsparse_int* dptr_null = nullptr; + + status = rocsparse_csrmm( + handle, trans, m, n, k, nnz, &alpha, descr, dval, dptr_null, dcol, dB, ldb, &beta, dC, ldc); + verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); + } + // testing for(nullptr == dcol) + { + rocsparse_int* dcol_null = nullptr; + + status = rocsparse_csrmm( + handle, trans, m, n, k, nnz, &alpha, descr, dval, dptr, dcol_null, dB, ldb, &beta, dC, ldc); + verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); + } + // testing for(nullptr == dval) + { + T* dval_null = nullptr; + + status = rocsparse_csrmm( + handle, trans, m, n, k, nnz, &alpha, descr, dval_null, dptr, dcol, dB, ldb, &beta, dC, ldc); + verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); + } + // testing for(nullptr == dB) + { + T* dB_null = nullptr; + + status = rocsparse_csrmm( + handle, trans, m, n, k, nnz, &alpha, descr, dval, dptr, dcol, dB_null, ldb, &beta, dC, ldc); + verify_rocsparse_status_invalid_pointer(status, "Error: dB is nullptr"); + } + // testing for(nullptr == dC) + { + T* dC_null = nullptr; + + status = rocsparse_csrmm( + handle, trans, m, n, k, nnz, &alpha, descr, dval, dptr, dcol, dB, ldb, &beta, dC_null, ldc); + verify_rocsparse_status_invalid_pointer(status, "Error: dC is nullptr"); + } + // testing for(nullptr == d_alpha) + { + T* 
d_alpha_null = nullptr; + + status = rocsparse_csrmm( + handle, trans, m, n, k, nnz, d_alpha_null, descr, dval, dptr, dcol, dB, ldb, &beta, dC, ldc); + verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); + } + // testing for(nullptr == d_beta) + { + T* d_beta_null = nullptr; + + status = rocsparse_csrmm( + handle, trans, m, n, k, nnz, &alpha, descr, dval, dptr, dcol, dB, ldb, d_beta_null, dC, ldc); + verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); + } + // testing for(nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = rocsparse_csrmm( + handle, trans, m, n, k, nnz, &alpha, descr_null, dval, dptr, dcol, dB, ldb, &beta, dC, ldc); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrmm( + handle_null, trans, m, n, k, nnz, &alpha, descr, dval, dptr, dcol, dB, ldb, &beta, dC, ldc); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_csrmm(Arguments argus) +{ +/* + rocsparse_int safe_size = 100; + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + T h_alpha = argus.alpha; + T h_beta = argus.beta; + rocsparse_operation trans = argus.trans; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_status status; + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + std::unique_ptr test_descr(new descr_struct); + rocsparse_mat_descr descr = test_descr->descr; + + // Set matrix index base + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr, idx_base)); + + // Determine number of non-zero elements + double scale = 0.02; + if(m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + + // Argument sanity check before allocating invalid memory + if(m <= 0 || n <= 0 || nnz <= 0) + { + auto dptr_managed = + 
rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy = (T*)dy_managed.get(); + + if(!dval || !dptr || !dcol || !dx || !dy) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dptr || !dcol || !dval || !dx || !dy"); + return rocsparse_status_memory_error; + } + + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + status = rocsparse_csrmm( + handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy); + + if(m < 0 || n < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + } + + return rocsparse_status_success; + } + + // Host structures + std::vector hcsr_row_ptr; + std::vector hcoo_row_ind; + std::vector hcol_ind; + std::vector hval; + + // Initial Data on CPU + srand(12345ULL); + if(argus.laplacian) + { + m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcol_ind, hval, idx_base); + nnz = hcsr_row_ptr[m]; + } + else + { + if(argus.filename != "") + { + if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval) != + 0) + { + fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcol_ind, hval, 
idx_base); + } + + // Convert COO to CSR + if(!argus.laplacian) + { + hcsr_row_ptr.resize(m + 1, 0); + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; + } + + hcsr_row_ptr[0] = idx_base; + for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + } + } + } + + std::vector hx(n); + std::vector hy_1(m); + std::vector hy_2(m); + std::vector hy_gold(m); + + rocsparse_init(hx, 1, n); + rocsparse_init(hy_1, 1, m); + + // copy vector is easy in STL; hy_gold = hx: save a copy in hy_gold which will be output of CPU + hy_2 = hy_1; + hy_gold = hy_1; + + // allocate memory on device + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * n), device_free}; + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + auto d_beta_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy_1 = (T*)dy_1_managed.get(); + T* dy_2 = (T*)dy_2_managed.get(); + T* d_alpha = (T*)d_alpha_managed.get(); + T* d_beta = (T*)d_beta_managed.get(); + + if(!dval || !dptr || !dcol || !dx || !dy_1 || !dy_2 || !d_alpha || !d_beta) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dval || !dptr || !dcol || !dx || " + "!dy_1 || !dy_2 || !d_alpha || !d_beta"); + return rocsparse_status_memory_error; + } 
+ + // copy data from CPU to device + CHECK_HIP_ERROR(hipMemcpy( + dptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dcol, hcol_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dval, hval.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * n, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T) * m, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); + + if(argus.unit_check) + { + CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * m, hipMemcpyHostToDevice)); + + // ROCSPARSE pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrmm( + handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1)); + + // ROCSPARSE pointer mode device + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrmm( + handle, trans, m, n, nnz, d_alpha, descr, dval, dptr, dcol, dx, d_beta, dy_2)); + + // copy output from device to CPU + CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * m, hipMemcpyDeviceToHost)); + + // CPU + double cpu_time_used = get_time_us(); + + for(rocsparse_int i = 0; i < m; ++i) + { + hy_gold[i] *= h_beta; + for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; + ++j) + { + hy_gold[i] += h_alpha * hval[j] * hx[hcol_ind[j] - idx_base]; + } + } + + cpu_time_used = get_time_us() - cpu_time_used; + + // enable unit check, notice unit check is not invasive, but norm check is, + // unit check and norm check can not be interchanged their 
order + if(argus.unit_check) + { + unit_check_general(1, m, hy_gold.data(), hy_1.data()); + unit_check_general(1, m, hy_gold.data(), hy_2.data()); + } + } + + if(argus.timing) + { + int number_cold_calls = 2; + int number_hot_calls = argus.iters; + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + for(int iter = 0; iter < number_cold_calls; iter++) + { + rocsparse_csrmm( + handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1); + } + + double gpu_time_used = get_time_us(); // in microseconds + + for(int iter = 0; iter < number_hot_calls; iter++) + { + rocsparse_csrmm( + handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1); + } + + // Convert to milliseconds per call + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + size_t flops = (h_alpha != 1.0) ? 3.0 * nnz : 2.0 * nnz; + flops = (h_beta != 0.0) ? flops + m : flops; + double gpu_gflops = flops / gpu_time_used / 1e6; + size_t memtrans = 2.0 * m + nnz; + memtrans = (h_beta != 0.0) ? 
memtrans + m : memtrans; + double bandwidth = + (memtrans * sizeof(T) + (m + 1 + nnz) * sizeof(rocsparse_int)) / gpu_time_used / 1e6; + + printf("m\t\tn\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", + m, + n, + nnz, + h_alpha, + h_beta, + gpu_gflops, + bandwidth, + gpu_time_used); + } +*/ + return rocsparse_status_success; +} + +#endif // TESTING_CSRMM_HPP diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index 83dd4f7b..3b267834 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -16,6 +16,7 @@ set(ROCSPARSE_TEST_SOURCES test_csrmv.cpp test_ellmv.cpp test_hybmv.cpp + test_csrmm.cpp test_csr2coo.cpp test_csr2ell.cpp test_csr2hyb.cpp diff --git a/clients/tests/test_csrmm.cpp b/clients/tests/test_csrmm.cpp new file mode 100644 index 00000000..bdb87a1d --- /dev/null +++ b/clients/tests/test_csrmm.cpp @@ -0,0 +1,68 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "testing_csrmm.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef rocsparse_index_base base; +typedef std::tuple csrmm_tuple; + +int csrmm_M_range[] = {-1, 0, 10, 500, 7111, 10000}; +int csrmm_N_range[] = {-3, 0, 33, 842, 4441, 10000}; + +std::vector csrmm_alpha_range = {2.0, 3.0}; +std::vector csrmm_beta_range = {0.0, 1.0}; + +base csrmm_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; + +class parameterized_csrmm : public testing::TestWithParam +{ + protected: + parameterized_csrmm() {} + virtual ~parameterized_csrmm() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_csrmm_arguments(csrmm_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.alpha = std::get<2>(tup); + arg.beta = std::get<3>(tup); + arg.idx_base = std::get<4>(tup); + arg.timing = 0; + return arg; +} + +TEST(csrmm_bad_arg, csrmm_float) { testing_csrmm_bad_arg(); } + +TEST_P(parameterized_csrmm, csrmm_float) +{ + Arguments arg = setup_csrmm_arguments(GetParam()); + + rocsparse_status status = testing_csrmm(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csrmm, csrmm_double) +{ + Arguments arg = setup_csrmm_arguments(GetParam()); + + rocsparse_status status = testing_csrmm(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(csrmm, + parameterized_csrmm, + testing::Combine(testing::ValuesIn(csrmm_M_range), + testing::ValuesIn(csrmm_N_range), + testing::ValuesIn(csrmm_alpha_range), + testing::ValuesIn(csrmm_beta_range), + testing::ValuesIn(csrmm_idxbase_range))); diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 7a970d1c..f590708a 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -805,6 +805,125 @@ rocsparse_status 
rocsparse_dhybmv(rocsparse_handle handle, * =========================================================================== */ +/*! \brief SPARSE Level 3 API + + \details + csrmm multiplies the sparse m x k CSR matrix A with dense matrix B and + stores the result in dense matrix C such that + + C := alpha * op(A) * B + beta * C + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + trans operation type of A. + @param[in] + m number of rows of A. + @param[in] + n number of columns of B and C. + @param[in] + k number of columns of A. + @param[in] + nnz number of non-zero entries of A. + @param[in] + alpha scalar alpha. + @param[in] + descr descriptor of A. + @param[in] + csr_val array of nnz elements of A. + @param[in] + csr_row_ptr array of m+1 elements that point to the start of every row + of A. + @param[in] + csr_col_ind array of nnz elements containing the column indices of A. + @param[in] + B array of dimension (ldb, n) containing the elements of B. + @param[in] + ldb leading dimension of B, must be at least max(1, k) if + op(A) == A or max(1, m) otherwise. + @param[in] + beta scalar beta. + @param[inout] + C array of dimension (ldc, n) containing the elements of C. + @param[in] + ldc leading dimension of C, must be at least max(1, m) if + op(A) == A or max(1, k) otherwise. 
+ + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsrmm(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const float* alpha, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const float* B, + rocsparse_int ldb, + const float* beta, + float* C, + rocsparse_int ldc); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsrmm(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const double* alpha, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const double* B, + rocsparse_int ldb, + const double* beta, + double* C, + rocsparse_int ldc); +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_ccsrmm(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const rocsparse_float_complex* alpha, + const rocsparse_mat_descr descr, + const rocsparse_float_complex* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_float_complex* B, + rocsparse_int ldb, + const rocsparse_float_complex* beta, + rocsparse_float_complex* C, + rocsparse_int ldc); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_zcsrmm(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const rocsparse_double_complex* alpha, + const rocsparse_mat_descr descr, + const rocsparse_double_complex* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const rocsparse_double_complex* B, + rocsparse_int ldb, + const rocsparse_double_complex* beta, + rocsparse_double_complex* C, + rocsparse_int ldc); 
+*/ + /* * =========================================================================== * Sparse Format Conversions diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index c12f74a0..0108a65a 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -23,6 +23,9 @@ set(rocsparse_source src/level2/rocsparse_ellmv.cpp src/level2/rocsparse_hybmv.cpp +# Level3 + src/level3/rocsparse_csrmm.cpp + # Conversion src/conversion/rocsparse_csr2coo.cpp src/conversion/rocsparse_csr2ell.cpp diff --git a/library/src/level3/csrmm_device.h b/library/src/level3/csrmm_device.h new file mode 100644 index 00000000..1f3545ca --- /dev/null +++ b/library/src/level3/csrmm_device.h @@ -0,0 +1,12 @@ +#pragma once +#ifndef CSRMM_DEVICE_H +#define CSRMM_DEVICE_H + +#include + +template +static __device__ void csrmmn_general_device() +{ +} + +#endif // CSRMM_DEVICE_H diff --git a/library/src/level3/rocsparse_csrmm.cpp b/library/src/level3/rocsparse_csrmm.cpp new file mode 100644 index 00000000..df780cc5 --- /dev/null +++ b/library/src/level3/rocsparse_csrmm.cpp @@ -0,0 +1,54 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "rocsparse.h" +#include "rocsparse_csrmm.hpp" + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_scsrmm(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const float* alpha, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const float* B, + rocsparse_int ldb, + const float* beta, + float* C, + rocsparse_int ldc) +{ + return rocsparse_csrmm_template( + handle, trans, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); +} + +extern "C" rocsparse_status rocsparse_dcsrmm(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const double* alpha, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const double* B, + rocsparse_int ldb, + const double* beta, + double* C, + rocsparse_int ldc) +{ + return rocsparse_csrmm_template( + handle, trans, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); +} diff --git a/library/src/level3/rocsparse_csrmm.hpp b/library/src/level3/rocsparse_csrmm.hpp new file mode 100644 index 00000000..1416ff33 --- /dev/null +++ b/library/src/level3/rocsparse_csrmm.hpp @@ -0,0 +1,49 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_CSRMM_HPP +#define ROCSPARSE_CSRMM_HPP + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "csrmm_device.h" + +#include + +template +__global__ void csrmmn_kernel_host_pointer() +{ + csrmmn_general_device(); +} + +template +__global__ void csrmmn_kernel_device_pointer() +{ + csrmmn_general_device(); +} + +template +rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const T* alpha, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* B, + rocsparse_int ldb, + const T* beta, + T* C, + rocsparse_int ldc) +{ + return rocsparse_status_not_implemented; +} + +#endif // ROCSPARSE_CSRMM_HPP From 82ed7d85a365666a9e7d0cb508d3ac098ab59aa8 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 15 Jun 2018 22:46:38 +0200 Subject: [PATCH 127/304] csrmm prep #2 --- library/src/level3/csrmm_device.h | 15 +- library/src/level3/rocsparse_csrmm.hpp | 203 ++++++++++++++++++++++++- 2 files changed, 212 insertions(+), 6 deletions(-) diff --git a/library/src/level3/csrmm_device.h b/library/src/level3/csrmm_device.h index 1f3545ca..26382766 100644 --- a/library/src/level3/csrmm_device.h +++ b/library/src/level3/csrmm_device.h @@ -5,7 +5,20 @@ #include template -static __device__ void csrmmn_general_device() +static __device__ void csrmmn_general_device(rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + T alpha, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* csr_val, + const T* B, + rocsparse_int ldb, + T beta, + T* C, + rocsparse_int ldc, + rocsparse_index_base idx_base) { } diff --git a/library/src/level3/rocsparse_csrmm.hpp b/library/src/level3/rocsparse_csrmm.hpp index 
1416ff33..7df7797b 100644 --- a/library/src/level3/rocsparse_csrmm.hpp +++ b/library/src/level3/rocsparse_csrmm.hpp @@ -14,15 +14,41 @@ #include template -__global__ void csrmmn_kernel_host_pointer() +__global__ void csrmmn_kernel_host_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + T alpha, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* csr_val, + const T* B, + rocsparse_int ldb, + T beta, + T* C, + rocsparse_int ldc, + rocsparse_index_base idx_base) { - csrmmn_general_device(); + csrmmn_general_device(m, n, k, nnz, alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, beta, C, ldc, idx_base); } template -__global__ void csrmmn_kernel_device_pointer() +__global__ void csrmmn_kernel_device_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const T* alpha, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* csr_val, + const T* B, + rocsparse_int ldb, + const T* beta, + T* C, + rocsparse_int ldc, + rocsparse_index_base idx_base) { - csrmmn_general_device(); + csrmmn_general_device(m, n, k, nnz, *alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, *beta, C, ldc, idx_base); } template @@ -43,7 +69,174 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, T* C, rocsparse_int ldc) { - return rocsparse_status_not_implemented; + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + if(handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_Xcsrmm"), + trans, + m, + n, + k, + nnz, + *alpha, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)B, + ldb, + *beta, + (const void*&)C, + ldc); + } + else + { + log_trace(handle, + 
replaceX("rocsparse_Xcsrmm"), + trans, + m, + n, + k, + nnz, + (const void*&)alpha, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)B, + ldb, + (const void*&)beta, + (const void*&)C, + ldc); + } + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(n < 0) + { + return rocsparse_status_invalid_size; + } + else if(k < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check leading dimensions + if(trans == rocsparse_operation_none) + { + if(ldb < std::max(1, k)) + { + return rocsparse_status_invalid_size; + } + else if(ldc < std::max(1, m)) + { + return rocsparse_status_invalid_size; + } + } + else + { + if(ldb < std::max(1, m)) + { + return rocsparse_status_invalid_size; + } + else if(ldc < std::max(1, k)) + { + return rocsparse_status_invalid_size; + } + } + + // Check pointer arguments + if(csr_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(B == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(C == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(alpha == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(beta == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0 || k == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + // Run 
different csrmv kernels + if(trans == rocsparse_operation_none) + { +#define CSRMMN_DIM 512 + dim3 csrmmn_blocks((m - 1) / CSRMMN_DIM + 1); + dim3 csrmmn_threads(CSRMMN_DIM); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + } + else + { + if(*alpha == 0.0 && *beta == 1.0) + { + return rocsparse_status_success; + } + } +#undef CSRMM_DIM + } + else + { + return rocsparse_status_not_implemented; + } + return rocsparse_status_success; } #endif // ROCSPARSE_CSRMM_HPP From e2e62c7d023415ecac18cad31a61f372ac47599e Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 22 Jun 2018 13:30:49 +0200 Subject: [PATCH 128/304] adding rocprim to the build chain --- cmake/Dependencies.cmake | 32 +++++++++++++++++++------------- library/CMakeLists.txt | 6 ++++++ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index e80faf72..dbd48079 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -105,19 +105,25 @@ if(BUILD_BENCHMARK) endif() # rocPRIM package -#set(ROCPRIM_ROOT ${CMAKE_CURRENT_BINARY_DIR}/rocPRIM CACHE PATH "") -#message(STATUS "Downloading rocPRIM.") -#download_project(PROJ rocPRIM -# GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git -# GIT_TAG master -# INSTALL_DIR ${ROCPRIM_ROOT} -# CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= -# LOG_DOWNLOAD TRUE -# LOG_CONFIGURE TRUE -# LOG_INSTALL TRUE -# BUILD_PROJECT TRUE -# UPDATE_DISCONNECT TRUE -#) +if(HIP_PLATFORM STREQUAL "hcc") + find_package(ROCPRIM QUIET CONFIG PATHS /opt/rocm) + if(NOT ROCPRIM_FOUND) + set(ROCPRIM_ROOT ${CMAKE_CURRENT_BINARY_DIR}/rocPRIM CACHE PATH "") + message(STATUS "Downloading rocPRIM.") + download_project(PROJ rocPRIM + GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git + GIT_TAG master + INSTALL_DIR ${ROCPRIM_ROOT} + CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= + LOG_DOWNLOAD TRUE + LOG_CONFIGURE TRUE 
+ LOG_INSTALL TRUE + BUILD_PROJECT TRUE + UPDATE_DISCONNECT TRUE + ) + find_package(ROCPRIM REQUIRED CONFIG PATHS ${ROCPRIM_ROOT}) + endif() +endif() # ROCm package find_package(ROCM QUIET CONFIG PATHS /opt/rocm) diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index 8776bfd1..572db6ff 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -41,6 +41,7 @@ endif() # Target include directories target_include_directories(rocsparse PRIVATE $ + $ PUBLIC $ $ $ @@ -54,6 +55,11 @@ endif() set_target_properties(rocsparse PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging") set_target_properties(rocsparse PROPERTIES DEBUG_POSTFIX "-d") +# Target definitions +if(HIP_PLATFORM STREQUAL "hcc") + target_compile_definitions(rocsparse PRIVATE ROCPRIM_HIP_API=1) +endif() + # Generate export header include(GenerateExportHeader) generate_export_header(rocsparse EXPORT_FILE_NAME ${PROJECT_BINARY_DIR}/include/rocsparse-export.h) From 3d3f100e080c2c18d44fde4d2e0f9bb8860fbf01 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 22 Jun 2018 14:02:26 +0200 Subject: [PATCH 129/304] hipcub/cub for nvcc target --- cmake/Dependencies.cmake | 18 ++++++++++++++++++ library/CMakeLists.txt | 1 + 2 files changed, 19 insertions(+) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index dbd48079..eadc9528 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -123,6 +123,24 @@ if(HIP_PLATFORM STREQUAL "hcc") ) find_package(ROCPRIM REQUIRED CONFIG PATHS ${ROCPRIM_ROOT}) endif() +elseif(HIP_PLATFORM STREQUAL "nvcc") + find_package(HIPCUB QUIET CONFIG PATHS /opt/rocm) + if(NOT HIPCUB_FOUND) + set(ROCPRIM_ROOT ${CMAKE_CURRENT_BINARY_DIR}/rocPRIM CACHE PATH "") + message(STATUS "Downloading rocPRIM.") + download_project(PROJ rocPRIM + GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git + GIT_TAG master + INSTALL_DIR ${ROCPRIM_ROOT} + CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= + 
LOG_DOWNLOAD TRUE + LOG_CONFIGURE TRUE + LOG_INSTALL TRUE + BUILD_PROJECT TRUE + UPDATE_DISCONNECT TRUE + ) + find_package(HIPCUB REQUIRED CONFIG PATHS ${ROCPRIM_ROOT}) + endif() endif() # ROCm package diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index 572db6ff..48d3f916 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -42,6 +42,7 @@ endif() target_include_directories(rocsparse PRIVATE $ $ + $ PUBLIC $ $ $ From ad0537d1a7d2e2e49c63307f2084733020cc8b86 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 22 Jun 2018 15:33:15 +0200 Subject: [PATCH 130/304] csrsort() added --- library/include/rocsparse-functions.h | 83 ++++++++- library/src/CMakeLists.txt | 1 + library/src/conversion/rocsparse_csrsort.cpp | 179 +++++++++++++++++++ 3 files changed, 261 insertions(+), 2 deletions(-) create mode 100644 library/src/conversion/rocsparse_csrsort.cpp diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 7a970d1c..828d8c1e 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -1082,15 +1082,94 @@ rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, handle rocsparse_handle. handle to the rocsparse library context queue. @param[in] - n size of the map p + n size of the map p. @param[out] - p array of n integers containing the map + p array of n integers containing the map. ********************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_create_identity_permutation(rocsparse_handle handle, rocsparse_int n, rocsparse_int* p); +/*! \brief SPARSE Format Conversions API + + \details + csrsort_buffer_size returns the size of the temporary storage buffer + that is required by csrsort. The temporary storage buffer has to be + allocated by the user. + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + m number of rows of A. 
+ @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero elements of A. + @param[in] + csr_row_ptr array of m+1 elements that point to the start + of every row of A. + @param[in] + csr_col_ind array of nnz elements containing the column indices + of A. + @param[out] + buffer_size number of bytes of the temporary storage buffer. + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csrsort_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + size_t* buffer_size); + +/*! \brief SPARSE Format Conversions API + + \details + csrsort sorts a matrix in CSR format in-place. csrsort requires a + temporary storage buffer. The sorted permutation vector perm can be + used to obtain sorted csr_val array. In this case, P must be + initialized as the identity permutation 0:1:(nnz-1). + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero elements of A. + @param[in] + descr descriptor of A. + @param[in] + csr_row_ptr array of m+1 elements that point to the start + of every row of A. + @param[inout] + csr_col_ind array of nnz elements containing the column indices + of A. + @param[inout] + perm array of nnz integers containing the unsorted map + indices. 
+ @param[in] + temp_buffer temporary storage buffer allocated by the user, + size is returned by csrsort_buffer_size + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csrsort(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + rocsparse_int* csr_col_ind, + rocsparse_int* perm, + void* temp_buffer); + #ifdef __cplusplus } #endif diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index c12f74a0..508b0ae9 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -29,4 +29,5 @@ set(rocsparse_source src/conversion/rocsparse_csr2hyb.cpp src/conversion/rocsparse_coo2csr.cpp src/conversion/rocsparse_identity.cpp + src/conversion/rocsparse_csrsort.cpp ) diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp new file mode 100644 index 00000000..659c1281 --- /dev/null +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -0,0 +1,179 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" + +#include + +#if defined(__HIP_PLATFORM_HCC__) +#include +#elif defined(__HIP_PLATFORM_NVCC__) +#include +#endif + +extern "C" rocsparse_status rocsparse_csrsort_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + size_t* buffer_size) +{ + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging TODO bench logging + log_trace(handle, "rocsparse_csrsort_buffer_size", + m, + n, + nnz, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)buffer_size); + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(n < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(buffer_size == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0 || nnz == 0) + { + *buffer_size = 0; + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + rocsparse_int* null_ptr = nullptr; + +// TODO config required for buffer?? 
+#if defined(__HIP_PLATFORM_HCC__) + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1> >; + + rocprim::segmented_radix_sort_pairs(nullptr, *buffer_size, null_ptr, null_ptr, null_ptr, null_ptr, nnz, m, null_ptr, null_ptr, 0, 32, stream); +#elif defined(__HIP_PLATFORM_NVCC__) + hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, *buffer_size, null_ptr, null_ptr, null_ptr, null_ptr, nnz, m, null_ptr, null_ptr, 0, 32, stream); +#endif + + return rocsparse_status_success; +} + +extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + rocsparse_int* csr_col_ind, + rocsparse_int* perm, + void* temp_buffer) +{ + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + log_trace(handle, "rocsparse_csrsort_buffer_size", + m, + n, + nnz, + (const void*&)descr, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)perm, + (const void*&)temp_buffer); + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(n < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(perm == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(temp_buffer == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + unsigned int startbit = 0; + unsigned int endbit 
= 32 - __builtin_clz(n); + size_t size; + +#if defined(__HIP_PLATFORM_HCC__) + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1> >; + + rocprim::segmented_radix_sort_pairs(nullptr, size, csr_col_ind, csr_col_ind, perm, perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + rocprim::segmented_radix_sort_pairs(temp_buffer, size, csr_col_ind, csr_col_ind, perm, perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); +#elif defined(__HIP_PLATFORM_NVCC__) + hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, size, csr_col_ind, csr_col_ind, perm, perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + hipcub::DeviceSegmentedRadixSort::SortPairs(temp_buffer, size, csr_col_ind, csr_col_ind, perm, perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); +#endif + + return rocsparse_status_success; +} From 178e4e0fbcea9d61f8abc67e66ab237969ac5555 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 22 Jun 2018 15:34:07 +0200 Subject: [PATCH 131/304] tests for csrsort --- clients/common/unit.cpp | 16 ++ clients/include/testing_csrsort.hpp | 344 ++++++++++++++++++++++++++++ clients/tests/CMakeLists.txt | 1 + clients/tests/test_csrsort.cpp | 48 ++++ 4 files changed, 409 insertions(+) create mode 100644 clients/include/testing_csrsort.hpp create mode 100644 clients/tests/test_csrsort.cpp diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index ee349418..8eba8eaf 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -67,3 +67,19 @@ void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int* hCPU, r } } } + +template <> +void unit_check_general(rocsparse_int M, rocsparse_int N, size_t* hCPU, size_t* hGPU) +{ + for(rocsparse_int j = 0; j < N; j++) + { + for(rocsparse_int i = 0; i < M; i++) + { +#ifdef GOOGLE_TEST + ASSERT_EQ(hCPU[i + j], hGPU[i + j]); +#else + assert(hCPU[i + j] == hGPU[i + j]); +#endif + } + } +} diff --git 
a/clients/include/testing_csrsort.hpp b/clients/include/testing_csrsort.hpp new file mode 100644 index 00000000..e2a6f486 --- /dev/null +++ b/clients/include/testing_csrsort.hpp @@ -0,0 +1,344 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef TESTING_CSRSORT_HPP +#define TESTING_CSRSORT_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +void testing_csrsort_bad_arg(void) +{ + rocsparse_int m = 100; + rocsparse_int n = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_descr(new descr_struct); + rocsparse_mat_descr descr = unique_ptr_descr->descr; + + size_t buffer_size = 0; + + auto csr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto perm_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto buffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); + rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); + rocsparse_int* perm = (rocsparse_int*)perm_managed.get(); + void* buffer = (void*)buffer_managed.get(); + + if(!csr_row_ptr || !csr_col_ind || !perm || !buffer) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // Testing csrsort_buffer_size for bad args + + // Testing for 
(csr_row_ptr == nullptr) + { + rocsparse_int* csr_row_ptr_null = nullptr; + + status = rocsparse_csrsort_buffer_size(handle, m, n, nnz, csr_row_ptr_null, csr_col_ind, &buffer_size); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); + } + + // Testing for (csr_col_ind == nullptr) + { + rocsparse_int* csr_col_ind_null = nullptr; + + status = rocsparse_csrsort_buffer_size(handle, m, n, nnz, csr_row_ptr, csr_col_ind_null, &buffer_size); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr"); + } + + // Testing for (buffer_size == nullptr) + { + size_t* buffer_size_null = nullptr; + + status = rocsparse_csrsort_buffer_size(handle, m, n, nnz, csr_row_ptr, csr_col_ind, buffer_size_null); + verify_rocsparse_status_invalid_pointer(status, "Error: buffer_size is nullptr"); + } + + // Testing for (handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrsort_buffer_size(handle_null, m, n, nnz, csr_row_ptr, csr_col_ind, &buffer_size); + verify_rocsparse_status_invalid_handle(status); + } + + // Testing csrsort for bad args + + // Testing for (csr_row_ptr == nullptr) + { + rocsparse_int* csr_row_ptr_null = nullptr; + + status = rocsparse_csrsort(handle, m, n, nnz, descr, csr_row_ptr_null, csr_col_ind, perm, buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); + } + + // Testing for (csr_col_ind == nullptr) + { + rocsparse_int* csr_col_ind_null = nullptr; + + status = rocsparse_csrsort(handle, m, n, nnz, descr, csr_row_ptr, csr_col_ind_null, perm, buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr"); + } + + // Testing for (perm == nullptr) + { + rocsparse_int* perm_null = nullptr; + + status = rocsparse_csrsort(handle, m, n, nnz, descr, csr_row_ptr, csr_col_ind, perm_null, buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: perm is nullptr"); + } + + // Testing for (buffer == nullptr) + { + 
rocsparse_int* buffer_null = nullptr; + + status = rocsparse_csrsort(handle, m, n, nnz, descr, csr_row_ptr, csr_col_ind, perm, buffer_null); + verify_rocsparse_status_invalid_pointer(status, "Error: buffer is nullptr"); + } + + // Testing for (descr == nullptr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = rocsparse_csrsort(handle, m, n, nnz, descr_null, csr_row_ptr, csr_col_ind, perm, buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + + // Testing for (handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrsort(handle_null, m, n, nnz, descr, csr_row_ptr, csr_col_ind, perm, buffer); + verify_rocsparse_status_invalid_handle(status); + } +} + +rocsparse_status testing_csrsort(Arguments argus) +{ + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + rocsparse_int safe_size = 100; + rocsparse_status status; + + size_t buffer_size = 0; + + double scale = 0.02; + if(m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_descr(new descr_struct); + rocsparse_mat_descr descr = unique_ptr_descr->descr; + + // Argument sanity check before allocating invalid memory + if(m <= 0 || n <= 0 || nnz <= 0) + { + auto csr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); + rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); + + if(!csr_row_ptr || !csr_col_ind) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!csr_row_ptr || !csr_col_ind"); + return rocsparse_status_memory_error; + } + + status = 
rocsparse_csrsort_buffer_size(handle, m, n, nnz, csr_row_ptr, csr_col_ind, &buffer_size); + + if(m < 0 || n < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + + // Buffer size should be zero + size_t zero = 0; + unit_check_general(1, 1, &zero, &buffer_size); + } + + return rocsparse_status_success; + } + + // For testing, assemble a COO matrix and convert it to CSR first (on host) + + // Host structures + std::vector hcsr_row_ptr(m + 1, 0); + std::vector hcoo_row_ind(nnz); + std::vector hcsr_col_ind(nnz); + std::vector hcsr_val(nnz); + + // Sample initial COO matrix on CPU + srand(12345ULL); + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, rocsparse_index_base_zero); + + // Convert COO to CSR + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr[hcoo_row_ind[i] + 1]; + } + + for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + } + + // Unsort CSR columns + std::vector hperm(nnz); + std::vector hcsr_col_ind_unsorted(nnz); + std::vector hcsr_val_unsorted(nnz); + + hcsr_col_ind_unsorted = hcsr_col_ind; + hcsr_val_unsorted = hcsr_val; + + for(rocsparse_int i = 0; i < m; ++i) + { + rocsparse_int row_begin = hcsr_row_ptr[i]; + rocsparse_int row_end = hcsr_row_ptr[i + 1]; + rocsparse_int row_nnz = row_end - row_begin; + + for(rocsparse_int j = row_begin; j < row_end; ++j) + { + rocsparse_int rng = row_begin + rand() % row_nnz; + + rocsparse_int temp_col = hcsr_col_ind_unsorted[j]; + float temp_val = hcsr_val_unsorted[j]; + + hcsr_col_ind_unsorted[j] = hcsr_col_ind_unsorted[rng]; + hcsr_val_unsorted[j] = hcsr_val_unsorted[rng]; + + hcsr_col_ind_unsorted[rng] = temp_col; + hcsr_val_unsorted[rng] = temp_val; + } + } + + // Allocate memory on the device + auto dcsr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + auto 
dcsr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dcsr_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(float) * nnz), device_free}; + auto dcsr_val_sorted_managed = + rocsparse_unique_ptr{device_malloc(sizeof(float) * nnz), device_free}; + auto dperm_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + + rocsparse_int* dcsr_row_ptr = (rocsparse_int*)dcsr_row_ptr_managed.get(); + rocsparse_int* dcsr_col_ind = (rocsparse_int*)dcsr_col_ind_managed.get(); + float* dcsr_val = (float*)dcsr_val_managed.get(); + float* dcsr_val_sorted = (float*)dcsr_val_sorted_managed.get(); + rocsparse_int* dperm = (rocsparse_int*)dperm_managed.get(); + + if(!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val || !dcsr_val_sorted || !dperm) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dcsr_row_ptr || !dcsr_col_ind || " + "!dcsr_val || !dcsr_val_sorted || !dperm"); + return rocsparse_status_memory_error; + } + + // Copy data from host to device + CHECK_HIP_ERROR(hipMemcpy(dcsr_row_ptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcsr_col_ind, hcsr_col_ind_unsorted.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcsr_val, hcsr_val_unsorted.data(), sizeof(float) * nnz, hipMemcpyHostToDevice)); + + if(argus.unit_check) + { + // Obtain buffer size + CHECK_ROCSPARSE_ERROR(rocsparse_csrsort_buffer_size(handle, m, n, nnz, dcsr_row_ptr, dcsr_col_ind, &buffer_size)); + + // Allocate buffer on the device + auto dbuffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * buffer_size), device_free}; + + void* dbuffer = (void*)dbuffer_managed.get(); + + if(!dbuffer) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dbuffer"); + return rocsparse_status_memory_error; + } + + // Initialize perm with identity permutation + 
CHECK_ROCSPARSE_ERROR(rocsparse_create_identity_permutation(handle, nnz, dperm)); + + // Sort CSR columns + CHECK_ROCSPARSE_ERROR(rocsparse_csrsort(handle, m, n, nnz, descr, dcsr_row_ptr, dcsr_col_ind, dperm, dbuffer)); + + // Sort CSR values + CHECK_ROCSPARSE_ERROR(rocsparse_sgthr(handle, nnz, dcsr_val, dcsr_val_sorted, dperm, rocsparse_index_base_zero)); + + // Copy output from device to host + CHECK_HIP_ERROR(hipMemcpy(hcsr_col_ind_unsorted.data(), dcsr_col_ind, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hcsr_val_unsorted.data(), dcsr_val_sorted, sizeof(float) * nnz, hipMemcpyDeviceToHost)); + + // Unit check + unit_check_general(1, nnz, hcsr_col_ind.data(), hcsr_col_ind_unsorted.data()); + unit_check_general(1, nnz, hcsr_val.data(), hcsr_val_unsorted.data()); + } + + if(argus.timing) + { +/* TODO + rocsparse_int number_cold_calls = 2; + rocsparse_int number_hot_calls = argus.iters; + + for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) + { + rocsparse_csrsort(handle, dcsr_row_ptr, nnz, m, dcoo_row_ind, idx_base); + } + + double gpu_time_used = get_time_us(); + + for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) + { + rocsparse_csrsort(handle, dcsr_row_ptr, nnz, m, dcoo_row_ind, idx_base); + } + + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + + double bandwidth = sizeof(rocsparse_int) * (nnz + m + 1) / gpu_time_used / 1e6; + + printf("m\t\tn\t\tnnz\t\tGB/s\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\n", m, n, nnz, bandwidth, gpu_time_used); +*/ + } + return rocsparse_status_success; +} + +#endif // TESTING_CSRSORT_HPP diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index 83dd4f7b..0304b631 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -21,6 +21,7 @@ set(ROCSPARSE_TEST_SOURCES test_csr2hyb.cpp test_coo2csr.cpp test_identity.cpp + test_csrsort.cpp ) set(ROCSPARSE_CLIENTS_COMMON diff --git 
a/clients/tests/test_csrsort.cpp b/clients/tests/test_csrsort.cpp new file mode 100644 index 00000000..86ab49c8 --- /dev/null +++ b/clients/tests/test_csrsort.cpp @@ -0,0 +1,48 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "testing_csrsort.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef std::tuple csrsort_tuple; + +int csrsort_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int csrsort_N_range[] = {-3, 0, 33, 242, 623, 1000}; + +class parameterized_csrsort : public testing::TestWithParam +{ + protected: + parameterized_csrsort() {} + virtual ~parameterized_csrsort() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_csrsort_arguments(csrsort_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.timing = 0; + return arg; +} + +TEST(csrsort_bad_arg, csrsort) { testing_csrsort_bad_arg(); } + +TEST_P(parameterized_csrsort, csrsort) +{ + Arguments arg = setup_csrsort_arguments(GetParam()); + + rocsparse_status status = testing_csrsort(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(csrsort, + parameterized_csrsort, + testing::Combine(testing::ValuesIn(csrsort_M_range), + testing::ValuesIn(csrsort_N_range))); From 9c41a42051aed956c69f7e80e53b385749d0a0ae Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 25 Jun 2018 11:14:39 +0200 Subject: [PATCH 132/304] function to obtain position of left most significant bit --- library/src/include/utility.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/library/src/include/utility.h b/library/src/include/utility.h index 3977fb08..d2c1ab29 100644 --- a/library/src/include/utility.h +++ b/library/src/include/utility.h @@ -14,6 +14,19 @@ #include #include +// Return the leftmost significant bit position +#if 
defined(rocsparse_ILP64) +static inline rocsparse_int rocsparse_clz(rocsparse_int n) +{ + return 64 - __builtin_clzll(n); +} +#else +static inline rocsparse_int rocsparse_clz(rocsparse_int n) +{ + return 32 - __builtin_clz(n); +} +#endif + // if trace logging is turned on with // (handle->layer_mode & rocsparse_layer_mode_log_trace) == true // then From ce203228cf46a97f6dcdb9203299ed35f6b3ab68 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 25 Jun 2018 11:14:49 +0200 Subject: [PATCH 133/304] updated csrsort() --- library/src/conversion/rocsparse_csrsort.cpp | 68 ++++++++++++++++++-- 1 file changed, 63 insertions(+), 5 deletions(-) diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index 659c1281..4b4d0b0c 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -86,6 +86,14 @@ extern "C" rocsparse_status rocsparse_csrsort_buffer_size(rocsparse_handle handl hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, *buffer_size, null_ptr, null_ptr, null_ptr, null_ptr, nnz, m, null_ptr, null_ptr, 0, 32, stream); #endif + // rocPRIM does not support in-place sorting, so we need additional buffer + // for all temporary arrays + + // columns buffer + *buffer_size += sizeof(rocsparse_int) * nnz; + // perm buffer + *buffer_size += sizeof(rocsparse_int) * nnz; + return rocsparse_status_success; } @@ -162,18 +170,68 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, hipStream_t stream = handle->stream; unsigned int startbit = 0; - unsigned int endbit = 32 - __builtin_clz(n); + unsigned int endbit = rocsparse_clz(n); size_t size; #if defined(__HIP_PLATFORM_HCC__) - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1> >; + rocprim::segmented_radix_sort_pairs(nullptr, size, csr_col_ind, csr_col_ind, perm, perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + + // Temporary buffer entry points + 
char* ptr = reinterpret_cast(temp_buffer); + ptr += size; + + // columns buffer + rocsparse_int* tmp_cols = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * nnz; + + // perm buffer + rocsparse_int* tmp_perm = reinterpret_cast(ptr); - rocprim::segmented_radix_sort_pairs(nullptr, size, csr_col_ind, csr_col_ind, perm, perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); - rocprim::segmented_radix_sort_pairs(temp_buffer, size, csr_col_ind, csr_col_ind, perm, perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + // Sort by columns and obtain permutation vector + + // Determine blocksize and items per thread depending on average nnz per row + rocsparse_int avg_row_nnz = nnz / m; + + if(avg_row_nnz < 64) + { + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1> >; + rocprim::segmented_radix_sort_pairs(temp_buffer, size, csr_col_ind, tmp_cols, perm, tmp_perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + } + else if(avg_row_nnz < 128) + { + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2> >; + rocprim::segmented_radix_sort_pairs(temp_buffer, size, csr_col_ind, tmp_cols, perm, tmp_perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + } + else if(avg_row_nnz < 256) + { + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4> >; + rocprim::segmented_radix_sort_pairs(temp_buffer, size, csr_col_ind, tmp_cols, perm, tmp_perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + } + else + { + rocprim::segmented_radix_sort_pairs(temp_buffer, size, csr_col_ind, tmp_cols, perm, tmp_perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + } #elif defined(__HIP_PLATFORM_NVCC__) hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, size, csr_col_ind, csr_col_ind, perm, perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); - 
hipcub::DeviceSegmentedRadixSort::SortPairs(temp_buffer, size, csr_col_ind, csr_col_ind, perm, perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + + // Temporary buffer entry points + char* ptr = reinterpret_cast(temp_buffer); + ptr += size; + + // columns buffer + rocsparse_int* tmp_cols = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * nnz; + + // perm buffer + rocsparse_int* tmp_perm = reinterpret_cast(ptr); + + // Sort by columns and obtain permutation vector + hipcub::DeviceSegmentedRadixSort::SortPairs(temp_buffer, size, csr_col_ind, tmp_cols, perm, tmp_perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); #endif + // Extract results from buffer + hipMemcpy(csr_col_ind, tmp_cols, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice); + hipMemcpy(perm, tmp_perm, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice); + return rocsparse_status_success; } From e9416da8017b1edd57565a3f57c128cdbeb1cec0 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 25 Jun 2018 12:36:15 +0200 Subject: [PATCH 134/304] clang format --- clients/include/testing_csrsort.hpp | 120 +++++++++------ clients/tests/test_csrsort.cpp | 6 +- library/include/rocsparse-functions.h | 8 +- library/src/conversion/rocsparse_csrsort.cpp | 150 ++++++++++++++++--- library/src/include/utility.h | 10 +- 5 files changed, 211 insertions(+), 83 deletions(-) diff --git a/clients/include/testing_csrsort.hpp b/clients/include/testing_csrsort.hpp index e2a6f486..09ff5eec 100644 --- a/clients/include/testing_csrsort.hpp +++ b/clients/include/testing_csrsort.hpp @@ -59,7 +59,8 @@ void testing_csrsort_bad_arg(void) { rocsparse_int* csr_row_ptr_null = nullptr; - status = rocsparse_csrsort_buffer_size(handle, m, n, nnz, csr_row_ptr_null, csr_col_ind, &buffer_size); + status = rocsparse_csrsort_buffer_size( + handle, m, n, nnz, csr_row_ptr_null, csr_col_ind, &buffer_size); verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); } @@ -67,7 
+68,8 @@ void testing_csrsort_bad_arg(void) { rocsparse_int* csr_col_ind_null = nullptr; - status = rocsparse_csrsort_buffer_size(handle, m, n, nnz, csr_row_ptr, csr_col_ind_null, &buffer_size); + status = rocsparse_csrsort_buffer_size( + handle, m, n, nnz, csr_row_ptr, csr_col_ind_null, &buffer_size); verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr"); } @@ -75,7 +77,8 @@ void testing_csrsort_bad_arg(void) { size_t* buffer_size_null = nullptr; - status = rocsparse_csrsort_buffer_size(handle, m, n, nnz, csr_row_ptr, csr_col_ind, buffer_size_null); + status = rocsparse_csrsort_buffer_size( + handle, m, n, nnz, csr_row_ptr, csr_col_ind, buffer_size_null); verify_rocsparse_status_invalid_pointer(status, "Error: buffer_size is nullptr"); } @@ -83,7 +86,8 @@ void testing_csrsort_bad_arg(void) { rocsparse_handle handle_null = nullptr; - status = rocsparse_csrsort_buffer_size(handle_null, m, n, nnz, csr_row_ptr, csr_col_ind, &buffer_size); + status = rocsparse_csrsort_buffer_size( + handle_null, m, n, nnz, csr_row_ptr, csr_col_ind, &buffer_size); verify_rocsparse_status_invalid_handle(status); } @@ -93,7 +97,8 @@ void testing_csrsort_bad_arg(void) { rocsparse_int* csr_row_ptr_null = nullptr; - status = rocsparse_csrsort(handle, m, n, nnz, descr, csr_row_ptr_null, csr_col_ind, perm, buffer); + status = rocsparse_csrsort( + handle, m, n, nnz, descr, csr_row_ptr_null, csr_col_ind, perm, buffer); verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); } @@ -101,7 +106,8 @@ void testing_csrsort_bad_arg(void) { rocsparse_int* csr_col_ind_null = nullptr; - status = rocsparse_csrsort(handle, m, n, nnz, descr, csr_row_ptr, csr_col_ind_null, perm, buffer); + status = rocsparse_csrsort( + handle, m, n, nnz, descr, csr_row_ptr, csr_col_ind_null, perm, buffer); verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr"); } @@ -109,7 +115,8 @@ void testing_csrsort_bad_arg(void) { rocsparse_int* perm_null = 
nullptr; - status = rocsparse_csrsort(handle, m, n, nnz, descr, csr_row_ptr, csr_col_ind, perm_null, buffer); + status = rocsparse_csrsort( + handle, m, n, nnz, descr, csr_row_ptr, csr_col_ind, perm_null, buffer); verify_rocsparse_status_invalid_pointer(status, "Error: perm is nullptr"); } @@ -117,7 +124,8 @@ void testing_csrsort_bad_arg(void) { rocsparse_int* buffer_null = nullptr; - status = rocsparse_csrsort(handle, m, n, nnz, descr, csr_row_ptr, csr_col_ind, perm, buffer_null); + status = rocsparse_csrsort( + handle, m, n, nnz, descr, csr_row_ptr, csr_col_ind, perm, buffer_null); verify_rocsparse_status_invalid_pointer(status, "Error: buffer is nullptr"); } @@ -125,7 +133,8 @@ void testing_csrsort_bad_arg(void) { rocsparse_mat_descr descr_null = nullptr; - status = rocsparse_csrsort(handle, m, n, nnz, descr_null, csr_row_ptr, csr_col_ind, perm, buffer); + status = rocsparse_csrsort( + handle, m, n, nnz, descr_null, csr_row_ptr, csr_col_ind, perm, buffer); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } @@ -133,16 +142,17 @@ void testing_csrsort_bad_arg(void) { rocsparse_handle handle_null = nullptr; - status = rocsparse_csrsort(handle_null, m, n, nnz, descr, csr_row_ptr, csr_col_ind, perm, buffer); + status = rocsparse_csrsort( + handle_null, m, n, nnz, descr, csr_row_ptr, csr_col_ind, perm, buffer); verify_rocsparse_status_invalid_handle(status); } } rocsparse_status testing_csrsort(Arguments argus) { - rocsparse_int m = argus.M; - rocsparse_int n = argus.N; - rocsparse_int safe_size = 100; + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + rocsparse_int safe_size = 100; rocsparse_status status; size_t buffer_size = 0; @@ -178,7 +188,8 @@ rocsparse_status testing_csrsort(Arguments argus) return rocsparse_status_memory_error; } - status = rocsparse_csrsort_buffer_size(handle, m, n, nnz, csr_row_ptr, csr_col_ind, &buffer_size); + status = rocsparse_csrsort_buffer_size( + handle, m, n, nnz, csr_row_ptr, csr_col_ind, 
&buffer_size); if(m < 0 || n < 0 || nnz < 0) { @@ -237,8 +248,8 @@ rocsparse_status testing_csrsort(Arguments argus) { rocsparse_int rng = row_begin + rand() % row_nnz; - rocsparse_int temp_col = hcsr_col_ind_unsorted[j]; - float temp_val = hcsr_val_unsorted[j]; + rocsparse_int temp_col = hcsr_col_ind_unsorted[j]; + float temp_val = hcsr_val_unsorted[j]; hcsr_col_ind_unsorted[j] = hcsr_col_ind_unsorted[rng]; hcsr_val_unsorted[j] = hcsr_val_unsorted[rng]; @@ -253,17 +264,17 @@ rocsparse_status testing_csrsort(Arguments argus) rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; auto dcsr_col_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; - auto dcsr_val_managed = - rocsparse_unique_ptr{device_malloc(sizeof(float) * nnz), device_free}; + auto dcsr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(float) * nnz), device_free}; auto dcsr_val_sorted_managed = rocsparse_unique_ptr{device_malloc(sizeof(float) * nnz), device_free}; - auto dperm_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dperm_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; rocsparse_int* dcsr_row_ptr = (rocsparse_int*)dcsr_row_ptr_managed.get(); rocsparse_int* dcsr_col_ind = (rocsparse_int*)dcsr_col_ind_managed.get(); - float* dcsr_val = (float*)dcsr_val_managed.get(); - float* dcsr_val_sorted = (float*)dcsr_val_sorted_managed.get(); - rocsparse_int* dperm = (rocsparse_int*)dperm_managed.get(); + float* dcsr_val = (float*)dcsr_val_managed.get(); + float* dcsr_val_sorted = (float*)dcsr_val_sorted_managed.get(); + rocsparse_int* dperm = (rocsparse_int*)dperm_managed.get(); if(!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val || !dcsr_val_sorted || !dperm) { @@ -274,17 +285,24 @@ rocsparse_status testing_csrsort(Arguments argus) } // Copy data from host to device - CHECK_HIP_ERROR(hipMemcpy(dcsr_row_ptr, hcsr_row_ptr.data(), 
sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dcsr_col_ind, hcsr_col_ind_unsorted.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dcsr_val, hcsr_val_unsorted.data(), sizeof(float) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy( + dcsr_row_ptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcsr_col_ind, + hcsr_col_ind_unsorted.data(), + sizeof(rocsparse_int) * nnz, + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dcsr_val, hcsr_val_unsorted.data(), sizeof(float) * nnz, hipMemcpyHostToDevice)); if(argus.unit_check) { // Obtain buffer size - CHECK_ROCSPARSE_ERROR(rocsparse_csrsort_buffer_size(handle, m, n, nnz, dcsr_row_ptr, dcsr_col_ind, &buffer_size)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrsort_buffer_size( + handle, m, n, nnz, dcsr_row_ptr, dcsr_col_ind, &buffer_size)); // Allocate buffer on the device - auto dbuffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * buffer_size), device_free}; + auto dbuffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * buffer_size), device_free}; void* dbuffer = (void*)dbuffer_managed.get(); @@ -298,14 +316,20 @@ rocsparse_status testing_csrsort(Arguments argus) CHECK_ROCSPARSE_ERROR(rocsparse_create_identity_permutation(handle, nnz, dperm)); // Sort CSR columns - CHECK_ROCSPARSE_ERROR(rocsparse_csrsort(handle, m, n, nnz, descr, dcsr_row_ptr, dcsr_col_ind, dperm, dbuffer)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrsort( + handle, m, n, nnz, descr, dcsr_row_ptr, dcsr_col_ind, dperm, dbuffer)); // Sort CSR values - CHECK_ROCSPARSE_ERROR(rocsparse_sgthr(handle, nnz, dcsr_val, dcsr_val_sorted, dperm, rocsparse_index_base_zero)); + CHECK_ROCSPARSE_ERROR(rocsparse_sgthr( + handle, nnz, dcsr_val, dcsr_val_sorted, dperm, rocsparse_index_base_zero)); // Copy output from device to host - CHECK_HIP_ERROR(hipMemcpy(hcsr_col_ind_unsorted.data(), dcsr_col_ind, 
sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hcsr_val_unsorted.data(), dcsr_val_sorted, sizeof(float) * nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hcsr_col_ind_unsorted.data(), + dcsr_col_ind, + sizeof(rocsparse_int) * nnz, + hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy( + hcsr_val_unsorted.data(), dcsr_val_sorted, sizeof(float) * nnz, hipMemcpyDeviceToHost)); // Unit check unit_check_general(1, nnz, hcsr_col_ind.data(), hcsr_col_ind_unsorted.data()); @@ -314,29 +338,29 @@ rocsparse_status testing_csrsort(Arguments argus) if(argus.timing) { -/* TODO - rocsparse_int number_cold_calls = 2; - rocsparse_int number_hot_calls = argus.iters; + /* TODO + rocsparse_int number_cold_calls = 2; + rocsparse_int number_hot_calls = argus.iters; - for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) - { - rocsparse_csrsort(handle, dcsr_row_ptr, nnz, m, dcoo_row_ind, idx_base); - } + for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) + { + rocsparse_csrsort(handle, dcsr_row_ptr, nnz, m, dcoo_row_ind, idx_base); + } - double gpu_time_used = get_time_us(); + double gpu_time_used = get_time_us(); - for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) - { - rocsparse_csrsort(handle, dcsr_row_ptr, nnz, m, dcoo_row_ind, idx_base); - } + for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) + { + rocsparse_csrsort(handle, dcsr_row_ptr, nnz, m, dcoo_row_ind, idx_base); + } - gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); - double bandwidth = sizeof(rocsparse_int) * (nnz + m + 1) / gpu_time_used / 1e6; + double bandwidth = sizeof(rocsparse_int) * (nnz + m + 1) / gpu_time_used / 1e6; - printf("m\t\tn\t\tnnz\t\tGB/s\tmsec\n"); - printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\n", m, n, nnz, bandwidth, gpu_time_used); -*/ + printf("m\t\tn\t\tnnz\t\tGB/s\tmsec\n"); + 
printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\n", m, n, nnz, bandwidth, gpu_time_used); + */ } return rocsparse_status_success; } diff --git a/clients/tests/test_csrsort.cpp b/clients/tests/test_csrsort.cpp index 86ab49c8..49bb468e 100644 --- a/clients/tests/test_csrsort.cpp +++ b/clients/tests/test_csrsort.cpp @@ -26,9 +26,9 @@ class parameterized_csrsort : public testing::TestWithParam Arguments setup_csrsort_arguments(csrsort_tuple tup) { Arguments arg; - arg.M = std::get<0>(tup); - arg.N = std::get<1>(tup); - arg.timing = 0; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.timing = 0; return arg; } diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 828d8c1e..f5b8cb8b 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -1129,10 +1129,10 @@ rocsparse_status rocsparse_csrsort_buffer_size(rocsparse_handle handle, /*! \brief SPARSE Format Conversions API \details - csrsort sorts a matrix in CSR format in-place. csrsort requires a - temporary storage buffer. The sorted permutation vector perm can be - used to obtain sorted csr_val array. In this case, P must be - initialized as the identity permutation 0:1:(nnz-1). + csrsort sorts a matrix in CSR format. csrsort requires a temporary + storage buffer. The sorted permutation vector perm can be used to + obtain sorted csr_val array. In this case, P must be initialized + as the identity permutation 0:1:(nnz-1). @param[in] handle rocsparse_handle. 
diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index 4b4d0b0c..01ee4bd0 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -3,6 +3,7 @@ * ************************************************************************ */ #include "rocsparse.h" +#include "definitions.h" #include "handle.h" #include "utility.h" @@ -29,7 +30,8 @@ extern "C" rocsparse_status rocsparse_csrsort_buffer_size(rocsparse_handle handl } // Logging TODO bench logging - log_trace(handle, "rocsparse_csrsort_buffer_size", + log_trace(handle, + "rocsparse_csrsort_buffer_size", m, n, nnz, @@ -77,13 +79,34 @@ extern "C" rocsparse_status rocsparse_csrsort_buffer_size(rocsparse_handle handl rocsparse_int* null_ptr = nullptr; -// TODO config required for buffer?? #if defined(__HIP_PLATFORM_HCC__) - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1> >; - - rocprim::segmented_radix_sort_pairs(nullptr, *buffer_size, null_ptr, null_ptr, null_ptr, null_ptr, nnz, m, null_ptr, null_ptr, 0, 32, stream); + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(nullptr, + *buffer_size, + null_ptr, + null_ptr, + null_ptr, + null_ptr, + nnz, + m, + null_ptr, + null_ptr, + 0, + 32, + stream)); #elif defined(__HIP_PLATFORM_NVCC__) - hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, *buffer_size, null_ptr, null_ptr, null_ptr, null_ptr, nnz, m, null_ptr, null_ptr, 0, 32, stream); + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, + *buffer_size, + null_ptr, + null_ptr, + null_ptr, + null_ptr, + nnz, + m, + null_ptr, + null_ptr, + 0, + 32, + stream)); #endif // rocPRIM does not support in-place sorting, so we need additional buffer @@ -118,7 +141,8 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, } // Logging TODO bench logging - log_trace(handle, "rocsparse_csrsort_buffer_size", + log_trace(handle, + 
"rocsparse_csrsort_buffer_size", m, n, nnz, @@ -170,11 +194,23 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, hipStream_t stream = handle->stream; unsigned int startbit = 0; - unsigned int endbit = rocsparse_clz(n); + unsigned int endbit = rocsparse_clz(n); size_t size; #if defined(__HIP_PLATFORM_HCC__) - rocprim::segmented_radix_sort_pairs(nullptr, size, csr_col_ind, csr_col_ind, perm, perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(nullptr, + size, + csr_col_ind, + csr_col_ind, + perm, + perm, + nnz, + m, + csr_row_ptr, + csr_row_ptr + 1, + startbit, + endbit, + stream)); // Temporary buffer entry points char* ptr = reinterpret_cast(temp_buffer); @@ -194,25 +230,85 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, if(avg_row_nnz < 64) { - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1> >; - rocprim::segmented_radix_sort_pairs(temp_buffer, size, csr_col_ind, tmp_cols, perm, tmp_perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, + size, + csr_col_ind, + tmp_cols, + perm, + tmp_perm, + nnz, + m, + csr_row_ptr, + csr_row_ptr + 1, + startbit, + endbit, + stream)); } else if(avg_row_nnz < 128) { - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2> >; - rocprim::segmented_radix_sort_pairs(temp_buffer, size, csr_col_ind, tmp_cols, perm, tmp_perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, + size, + csr_col_ind, + tmp_cols, + perm, + tmp_perm, + nnz, + m, + csr_row_ptr, + csr_row_ptr + 1, + startbit, + 
endbit, + stream)); } else if(avg_row_nnz < 256) { - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4> >; - rocprim::segmented_radix_sort_pairs(temp_buffer, size, csr_col_ind, tmp_cols, perm, tmp_perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, + size, + csr_col_ind, + tmp_cols, + perm, + tmp_perm, + nnz, + m, + csr_row_ptr, + csr_row_ptr + 1, + startbit, + endbit, + stream)); } else { - rocprim::segmented_radix_sort_pairs(temp_buffer, size, csr_col_ind, tmp_cols, perm, tmp_perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, + size, + csr_col_ind, + tmp_cols, + perm, + tmp_perm, + nnz, + m, + csr_row_ptr, + csr_row_ptr + 1, + startbit, + endbit, + stream)); } #elif defined(__HIP_PLATFORM_NVCC__) - hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, size, csr_col_ind, csr_col_ind, perm, perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, + size, + csr_col_ind, + csr_col_ind, + perm, + perm, + nnz, + m, + csr_row_ptr, + csr_row_ptr + 1, + startbit, + endbit, + stream)); // Temporary buffer entry points char* ptr = reinterpret_cast(temp_buffer); @@ -226,12 +322,26 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, rocsparse_int* tmp_perm = reinterpret_cast(ptr); // Sort by columns and obtain permutation vector - hipcub::DeviceSegmentedRadixSort::SortPairs(temp_buffer, size, csr_col_ind, tmp_cols, perm, tmp_perm, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream); + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(temp_buffer, + size, + csr_col_ind, + tmp_cols, + perm, + tmp_perm, + nnz, + m, + csr_row_ptr, + 
csr_row_ptr + 1, + startbit, + endbit, + stream)); #endif // Extract results from buffer - hipMemcpy(csr_col_ind, tmp_cols, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice); - hipMemcpy(perm, tmp_perm, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice); + RETURN_IF_HIP_ERROR( + hipMemcpy(csr_col_ind, tmp_cols, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR( + hipMemcpy(perm, tmp_perm, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); return rocsparse_status_success; } diff --git a/library/src/include/utility.h b/library/src/include/utility.h index d2c1ab29..fe4aa6db 100644 --- a/library/src/include/utility.h +++ b/library/src/include/utility.h @@ -16,15 +16,9 @@ // Return the leftmost significant bit position #if defined(rocsparse_ILP64) -static inline rocsparse_int rocsparse_clz(rocsparse_int n) -{ - return 64 - __builtin_clzll(n); -} +static inline rocsparse_int rocsparse_clz(rocsparse_int n) { return 64 - __builtin_clzll(n); } #else -static inline rocsparse_int rocsparse_clz(rocsparse_int n) -{ - return 32 - __builtin_clz(n); -} +static inline rocsparse_int rocsparse_clz(rocsparse_int n) { return 32 - __builtin_clz(n); } #endif // if trace logging is turned on with From ab723bfb3dded404566f8709627c750ab2e19b83 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 25 Jun 2018 13:38:19 +0200 Subject: [PATCH 135/304] coo2csr benchmark can now also read mtx files and create laplacian matrices --- clients/include/testing_coo2csr.hpp | 43 +++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/clients/include/testing_coo2csr.hpp b/clients/include/testing_coo2csr.hpp index 29a65432..1d33543c 100644 --- a/clients/include/testing_coo2csr.hpp +++ b/clients/include/testing_coo2csr.hpp @@ -118,15 +118,46 @@ rocsparse_status testing_coo2csr(Arguments argus) } // Host structures - std::vector hcoo_row_ind(nnz); - std::vector hcoo_col_ind(nnz); - std::vector hcoo_val(nnz); - std::vector 
hcsr_row_ptr(m + 1); - std::vector hcsr_row_ptr_gold(m + 1, 0); + std::vector hcoo_row_ind; + std::vector hcoo_col_ind; + std::vector hcoo_val; // Sample initial COO matrix on CPU srand(12345ULL); - gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val, idx_base); + if(argus.laplacian) + { + std::vector hptr(m + 1); + m = n = gen_2d_laplacian(argus.laplacian, hptr, hcoo_col_ind, hcoo_val, idx_base); + nnz = hptr[m]; + hcoo_row_ind.resize(nnz); + + // Convert to COO + for(rocsparse_int i = 0; i < m; ++i) + { + for(rocsparse_int j = hptr[i]; j < hptr[i + 1]; ++j) + { + hcoo_row_ind[j - idx_base] = i + idx_base; + } + } + } + else + { + if(argus.filename != "") + { + if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val, idx_base); + } + } + + std::vector hcsr_row_ptr(m + 1); + std::vector hcsr_row_ptr_gold(m + 1, 0); // Allocate memory on the device auto dcoo_row_ind_managed = From 95a6d94d45ea0f17e99e653d80ed7f557b947874 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 25 Jun 2018 14:21:48 +0200 Subject: [PATCH 136/304] fix for benchmark option verify=1 --- clients/common/unit.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index 8eba8eaf..d7533216 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -5,11 +5,18 @@ #include "unit.hpp" #include -#include #include #ifdef GOOGLE_TEST #include +#else +#ifdef NDEBUG +#undef NDEBUG +#include +#define NDEBUG +#else +#include +#endif #endif /* ========================================Gtest Unit Check From 12b68a9564385ca8599efb840a8179b0f857eb52 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 25 Jun 2018 17:15:08 +0200 Subject: [PATCH 137/304] double buffering for csrsort 
--- library/src/conversion/coo2csr_device.h | 8 +- library/src/conversion/rocsparse_csrsort.cpp | 150 ++++++++----------- 2 files changed, 69 insertions(+), 89 deletions(-) diff --git a/library/src/conversion/coo2csr_device.h b/library/src/conversion/coo2csr_device.h index 5a294183..676a7174 100644 --- a/library/src/conversion/coo2csr_device.h +++ b/library/src/conversion/coo2csr_device.h @@ -9,10 +9,10 @@ #include // Compute lower bound by binary search -__device__ rocsparse_int lower_bound(const rocsparse_int* arr, - rocsparse_int key, - rocsparse_int low, - rocsparse_int high) +static inline __device__ rocsparse_int lower_bound(const rocsparse_int* arr, + rocsparse_int key, + rocsparse_int low, + rocsparse_int high) { if(low > high) { diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index 01ee4bd0..47cac931 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -77,36 +77,17 @@ extern "C" rocsparse_status rocsparse_csrsort_buffer_size(rocsparse_handle handl // Stream hipStream_t stream = handle->stream; - rocsparse_int* null_ptr = nullptr; - + rocsparse_int* ptr = reinterpret_cast(buffer_size); #if defined(__HIP_PLATFORM_HCC__) - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(nullptr, - *buffer_size, - null_ptr, - null_ptr, - null_ptr, - null_ptr, - nnz, - m, - null_ptr, - null_ptr, - 0, - 32, - stream)); + rocprim::double_buffer dummy(ptr, ptr); + + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs( + nullptr, *buffer_size, dummy, dummy, nnz, m, buffer_size, buffer_size, 0, 32, stream)); #elif defined(__HIP_PLATFORM_NVCC__) - RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, - *buffer_size, - null_ptr, - null_ptr, - null_ptr, - null_ptr, - nnz, - m, - null_ptr, - null_ptr, - 0, - 32, - stream)); + hipcub::DoubleBuffer dummy(ptr, ptr); + + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs( + nullptr, 
*buffer_size, dummy, dummy, nnz, m, buffer_size, buffer_size, 0, 32, stream)); #endif // rocPRIM does not support in-place sorting, so we need additional buffer @@ -198,12 +179,12 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, size_t size; #if defined(__HIP_PLATFORM_HCC__) + rocprim::double_buffer dummy(csr_col_ind, perm); + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(nullptr, size, - csr_col_ind, - csr_col_ind, - perm, - perm, + dummy, + dummy, nnz, m, csr_row_ptr, @@ -211,6 +192,21 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, startbit, endbit, stream)); +#elif defined(__HIP_PLATFORM_NVCC__) + hipcub::DoubleBuffer dummy(csr_col_ind, perm); + + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, + size, + dummy, + dummy, + nnz, + m, + csr_row_ptr, + csr_row_ptr + 1, + startbit, + endbit, + stream)); +#endif // Temporary buffer entry points char* ptr = reinterpret_cast(temp_buffer); @@ -223,7 +219,11 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, // perm buffer rocsparse_int* tmp_perm = reinterpret_cast(ptr); - // Sort by columns and obtain permutation vector +// Sort by columns and obtain permutation vector + +#if defined(__HIP_PLATFORM_HCC__) + rocprim::double_buffer keys(csr_col_ind, tmp_cols); + rocprim::double_buffer vals(perm, tmp_perm); // Determine blocksize and items per thread depending on average nnz per row rocsparse_int avg_row_nnz = nnz / m; @@ -233,10 +233,8 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, size, - csr_col_ind, - tmp_cols, - perm, - tmp_perm, + keys, + vals, nnz, m, csr_row_ptr, @@ -250,10 +248,8 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, using config = rocprim::segmented_radix_sort_config<6, 5, 
rocprim::kernel_config<64, 2>>; RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, size, - csr_col_ind, - tmp_cols, - perm, - tmp_perm, + keys, + vals, nnz, m, csr_row_ptr, @@ -267,10 +263,8 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, size, - csr_col_ind, - tmp_cols, - perm, - tmp_perm, + keys, + vals, nnz, m, csr_row_ptr, @@ -283,10 +277,8 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, { RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, size, - csr_col_ind, - tmp_cols, - perm, - tmp_perm, + keys, + vals, nnz, m, csr_row_ptr, @@ -295,39 +287,24 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, endbit, stream)); } + if(keys.current() != csr_col_ind) + { + RETURN_IF_HIP_ERROR(hipMemcpy( + csr_col_ind, keys.current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } + if(vals.current() != perm) + { + RETURN_IF_HIP_ERROR( + hipMemcpy(perm, vals.current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } #elif defined(__HIP_PLATFORM_NVCC__) - RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, - size, - csr_col_ind, - csr_col_ind, - perm, - perm, - nnz, - m, - csr_row_ptr, - csr_row_ptr + 1, - startbit, - endbit, - stream)); - - // Temporary buffer entry points - char* ptr = reinterpret_cast(temp_buffer); - ptr += size; - - // columns buffer - rocsparse_int* tmp_cols = reinterpret_cast(ptr); - ptr += sizeof(rocsparse_int) * nnz; - - // perm buffer - rocsparse_int* tmp_perm = reinterpret_cast(ptr); + hipcub::DoubleBuffer keys(csr_col_ind, tmp_cols); + hipcub::DoubleBuffer vals(perm, tmp_perm); - // Sort by columns and obtain permutation vector RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(temp_buffer, size, - csr_col_ind, - tmp_cols, - 
perm, - tmp_perm, + keys, + vals, nnz, m, csr_row_ptr, @@ -335,13 +312,16 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, startbit, endbit, stream)); + if(keys.Current() != csr_col_ind) + { + RETURN_IF_HIP_ERROR(hipMemcpy( + csr_col_ind, keys.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } + if(vals.Current() != perm) + { + RETURN_IF_HIP_ERROR( + hipMemcpy(perm, vals.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } #endif - - // Extract results from buffer - RETURN_IF_HIP_ERROR( - hipMemcpy(csr_col_ind, tmp_cols, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); - RETURN_IF_HIP_ERROR( - hipMemcpy(perm, tmp_perm, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); - return rocsparse_status_success; } From 62bead77b128df09bb3f4c853a67fc316b6b5e50 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 26 Jun 2018 08:41:48 +0200 Subject: [PATCH 138/304] index_base_one support for csrsort() --- clients/include/testing_csrsort.hpp | 19 +++++--- clients/tests/test_csrsort.cpp | 17 ++++--- library/src/conversion/rocsparse_csrsort.cpp | 50 ++++++++++++++++---- 3 files changed, 62 insertions(+), 24 deletions(-) diff --git a/clients/include/testing_csrsort.hpp b/clients/include/testing_csrsort.hpp index 09ff5eec..772867b0 100644 --- a/clients/include/testing_csrsort.hpp +++ b/clients/include/testing_csrsort.hpp @@ -150,9 +150,10 @@ void testing_csrsort_bad_arg(void) rocsparse_status testing_csrsort(Arguments argus) { - rocsparse_int m = argus.M; - rocsparse_int n = argus.N; - rocsparse_int safe_size = 100; + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + rocsparse_int safe_size = 100; + rocsparse_index_base idx_base = argus.idx_base; rocsparse_status status; size_t buffer_size = 0; @@ -170,6 +171,9 @@ rocsparse_status testing_csrsort(Arguments argus) std::unique_ptr unique_ptr_descr(new descr_struct); rocsparse_mat_descr descr = unique_ptr_descr->descr; + // Set matrix index base + 
CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr, idx_base)); + // Argument sanity check before allocating invalid memory if(m <= 0 || n <= 0 || nnz <= 0) { @@ -217,14 +221,15 @@ rocsparse_status testing_csrsort(Arguments argus) // Sample initial COO matrix on CPU srand(12345ULL); - gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, rocsparse_index_base_zero); + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base); // Convert COO to CSR for(rocsparse_int i = 0; i < nnz; ++i) { - ++hcsr_row_ptr[hcoo_row_ind[i] + 1]; + ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; } + hcsr_row_ptr[0] = idx_base; for(rocsparse_int i = 0; i < m; ++i) { hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; @@ -240,8 +245,8 @@ rocsparse_status testing_csrsort(Arguments argus) for(rocsparse_int i = 0; i < m; ++i) { - rocsparse_int row_begin = hcsr_row_ptr[i]; - rocsparse_int row_end = hcsr_row_ptr[i + 1]; + rocsparse_int row_begin = hcsr_row_ptr[i] - idx_base; + rocsparse_int row_end = hcsr_row_ptr[i + 1] - idx_base; rocsparse_int row_nnz = row_end - row_begin; for(rocsparse_int j = row_begin; j < row_end; ++j) diff --git a/clients/tests/test_csrsort.cpp b/clients/tests/test_csrsort.cpp index 49bb468e..ad16ae67 100644 --- a/clients/tests/test_csrsort.cpp +++ b/clients/tests/test_csrsort.cpp @@ -9,10 +9,11 @@ #include #include -typedef std::tuple csrsort_tuple; +typedef std::tuple csrsort_tuple; -int csrsort_M_range[] = {-1, 0, 10, 500, 872, 1000}; -int csrsort_N_range[] = {-3, 0, 33, 242, 623, 1000}; +int csrsort_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int csrsort_N_range[] = {-3, 0, 33, 242, 623, 1000}; +rocsparse_index_base csrsort_base[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; class parameterized_csrsort : public testing::TestWithParam { @@ -26,9 +27,10 @@ class parameterized_csrsort : public testing::TestWithParam Arguments setup_csrsort_arguments(csrsort_tuple tup) { Arguments arg; - arg.M = std::get<0>(tup); - arg.N = 
std::get<1>(tup); - arg.timing = 0; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.timing = 0; return arg; } @@ -45,4 +47,5 @@ TEST_P(parameterized_csrsort, csrsort) INSTANTIATE_TEST_CASE_P(csrsort, parameterized_csrsort, testing::Combine(testing::ValuesIn(csrsort_M_range), - testing::ValuesIn(csrsort_N_range))); + testing::ValuesIn(csrsort_N_range), + testing::ValuesIn(csrsort_base))); diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index 47cac931..3bac0e3c 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -6,6 +6,7 @@ #include "definitions.h" #include "handle.h" #include "utility.h" +#include "csrsort_device.h" #include @@ -97,6 +98,8 @@ extern "C" rocsparse_status rocsparse_csrsort_buffer_size(rocsparse_handle handl *buffer_size += sizeof(rocsparse_int) * nnz; // perm buffer *buffer_size += sizeof(rocsparse_int) * nnz; + // segm buffer + *buffer_size += sizeof(rocsparse_int) * (m + 1); return rocsparse_status_success; } @@ -218,6 +221,33 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, // perm buffer rocsparse_int* tmp_perm = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * nnz; + + // segm buffer + rocsparse_int* tmp_segm = nullptr; + + // Index base one requires shift of offset positions + if(descr->base == rocsparse_index_base_one) + { + tmp_segm = reinterpret_cast(ptr); + +#define CSRSORT_DIM 512 + dim3 csrsort_blocks(m / CSRSORT_DIM + 1); + dim3 csrsort_threads(CSRSORT_DIM); + + hipLaunchKernelGGL((csrsort_shift_kernel), + csrsort_blocks, + csrsort_threads, + 0, + stream, + m + 1, + csr_row_ptr, + tmp_segm); +#undef CSRSORT_DIM + } + + // Switch between offsets + const rocsparse_int* offsets = tmp_segm ? 
tmp_segm : csr_row_ptr; // Sort by columns and obtain permutation vector @@ -237,8 +267,8 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, vals, nnz, m, - csr_row_ptr, - csr_row_ptr + 1, + offsets, + offsets + 1, startbit, endbit, stream)); @@ -252,8 +282,8 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, vals, nnz, m, - csr_row_ptr, - csr_row_ptr + 1, + offsets, + offsets + 1, startbit, endbit, stream)); @@ -267,8 +297,8 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, vals, nnz, m, - csr_row_ptr, - csr_row_ptr + 1, + offsets, + offsets + 1, startbit, endbit, stream)); @@ -281,8 +311,8 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, vals, nnz, m, - csr_row_ptr, - csr_row_ptr + 1, + offsets, + offsets + 1, startbit, endbit, stream)); @@ -307,8 +337,8 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, vals, nnz, m, - csr_row_ptr, - csr_row_ptr + 1, + offsets, + offsets + 1, startbit, endbit, stream)); From ff4074b0150eb7ec5671855028a00bf8987e96c4 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 26 Jun 2018 08:42:30 +0200 Subject: [PATCH 139/304] clang format --- library/src/conversion/rocsparse_csrsort.cpp | 65 +++----------------- 1 file changed, 10 insertions(+), 55 deletions(-) diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index 3bac0e3c..7f1c53db 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -261,61 +261,25 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, if(avg_row_nnz < 64) { using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, - size, - keys, - vals, - nnz, - m, - offsets, - offsets + 1, - startbit, - endbit, - stream)); + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs( + 
temp_buffer, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); } else if(avg_row_nnz < 128) { using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, - size, - keys, - vals, - nnz, - m, - offsets, - offsets + 1, - startbit, - endbit, - stream)); + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs( + temp_buffer, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); } else if(avg_row_nnz < 256) { using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, - size, - keys, - vals, - nnz, - m, - offsets, - offsets + 1, - startbit, - endbit, - stream)); + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs( + temp_buffer, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); } else { - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, - size, - keys, - vals, - nnz, - m, - offsets, - offsets + 1, - startbit, - endbit, - stream)); + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs( + temp_buffer, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); } if(keys.current() != csr_col_ind) { @@ -331,17 +295,8 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, hipcub::DoubleBuffer keys(csr_col_ind, tmp_cols); hipcub::DoubleBuffer vals(perm, tmp_perm); - RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(temp_buffer, - size, - keys, - vals, - nnz, - m, - offsets, - offsets + 1, - startbit, - endbit, - stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs( + temp_buffer, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); if(keys.Current() != csr_col_ind) { RETURN_IF_HIP_ERROR(hipMemcpy( From 0b7d2dd0b7dbafe465ebe60ca120e718c6995cb3 Mon Sep 17 00:00:00 2001 From: Nico 
Trost Date: Tue, 26 Jun 2018 09:27:22 +0200 Subject: [PATCH 140/304] csrsort(): added benchmark ; support to execute csrsort() without permutation vector (perm == nullptr) --- clients/benchmarks/client.cpp | 8 +- clients/include/testing_csrsort.hpp | 138 ++++++---- clients/include/utility.hpp | 2 + clients/tests/test_csrsort.cpp | 7 +- library/src/conversion/csrsort_device.h | 25 ++ library/src/conversion/rocsparse_csrsort.cpp | 268 +++++++++++++------ 6 files changed, 315 insertions(+), 133 deletions(-) create mode 100644 library/src/conversion/csrsort_device.h diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 62c89ef8..cf8d4737 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -25,6 +25,7 @@ #include "testing_csr2hyb.hpp" #include "testing_coo2csr.hpp" #include "testing_identity.hpp" +#include "testing_csrsort.hpp" #include #include @@ -82,7 +83,8 @@ int main(int argc, char* argv[]) "SPARSE function to test. Options:\n" " Level1: axpyi, doti, gthr, gthrz, roti, sctr\n" " Level2: coomv, csrmv, ellmv, hybmv\n" - " Conversion: csr2coo, csr2ell, csr2hyb, coo2csr") + " Conversion: csr2coo, csr2ell, csr2hyb, coo2csr\n" + " Sorting: csrsort") ("precision,r", po::value(&precision)->default_value('s'), "Options: s,d") @@ -230,6 +232,10 @@ int main(int argc, char* argv[]) { testing_coo2csr(argus); } + else if(function == "csrsort") + { + testing_csrsort(argus); + } else { fprintf(stderr, "Invalid value for --function\n"); diff --git a/clients/include/testing_csrsort.hpp b/clients/include/testing_csrsort.hpp index 772867b0..66218900 100644 --- a/clients/include/testing_csrsort.hpp +++ b/clients/include/testing_csrsort.hpp @@ -111,15 +111,6 @@ void testing_csrsort_bad_arg(void) verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr"); } - // Testing for (perm == nullptr) - { - rocsparse_int* perm_null = nullptr; - - status = rocsparse_csrsort( - handle, m, n, nnz, descr, csr_row_ptr, 
csr_col_ind, perm_null, buffer); - verify_rocsparse_status_invalid_pointer(status, "Error: perm is nullptr"); - } - // Testing for (buffer == nullptr) { rocsparse_int* buffer_null = nullptr; @@ -153,6 +144,7 @@ rocsparse_status testing_csrsort(Arguments argus) rocsparse_int m = argus.M; rocsparse_int n = argus.N; rocsparse_int safe_size = 100; + rocsparse_int permute = argus.temp; rocsparse_index_base idx_base = argus.idx_base; rocsparse_status status; @@ -214,25 +206,46 @@ rocsparse_status testing_csrsort(Arguments argus) // For testing, assemble a COO matrix and convert it to CSR first (on host) // Host structures - std::vector hcsr_row_ptr(m + 1, 0); - std::vector hcoo_row_ind(nnz); - std::vector hcsr_col_ind(nnz); - std::vector hcsr_val(nnz); + std::vector hcsr_row_ptr; + std::vector hcoo_row_ind; + std::vector hcsr_col_ind; + std::vector hcsr_val; // Sample initial COO matrix on CPU srand(12345ULL); - gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base); - - // Convert COO to CSR - for(rocsparse_int i = 0; i < nnz; ++i) + if(argus.laplacian) { - ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; + m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base); + nnz = hcsr_row_ptr[m]; } - - hcsr_row_ptr[0] = idx_base; - for(rocsparse_int i = 0; i < m; ++i) + else { - hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + if(argus.filename != "") + { + if(read_mtx_matrix( + argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base); + } + + // Convert COO to CSR + hcsr_row_ptr.resize(m + 1, 0); + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; + } + + hcsr_row_ptr[0] = idx_base; + for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr[i + 1] += 
hcsr_row_ptr[i]; + } } // Unsort CSR columns @@ -279,13 +292,15 @@ rocsparse_status testing_csrsort(Arguments argus) rocsparse_int* dcsr_col_ind = (rocsparse_int*)dcsr_col_ind_managed.get(); float* dcsr_val = (float*)dcsr_val_managed.get(); float* dcsr_val_sorted = (float*)dcsr_val_sorted_managed.get(); - rocsparse_int* dperm = (rocsparse_int*)dperm_managed.get(); - if(!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val || !dcsr_val_sorted || !dperm) + // Set permutation vector, if asked for + rocsparse_int* dperm = permute ? (rocsparse_int*)dperm_managed.get() : nullptr; + + if(!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val || !dcsr_val_sorted || (permute && !dperm)) { verify_rocsparse_status_success(rocsparse_status_memory_error, - "!dcsr_row_ptr || !dcsr_col_ind || " - "!dcsr_val || !dcsr_val_sorted || !dperm"); + "!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val || " + "!dcsr_val_sorted || (permute && !dperm)"); return rocsparse_status_memory_error; } @@ -317,55 +332,76 @@ rocsparse_status testing_csrsort(Arguments argus) return rocsparse_status_memory_error; } - // Initialize perm with identity permutation - CHECK_ROCSPARSE_ERROR(rocsparse_create_identity_permutation(handle, nnz, dperm)); + if(permute) + { + // Initialize perm with identity permutation + CHECK_ROCSPARSE_ERROR(rocsparse_create_identity_permutation(handle, nnz, dperm)); + } // Sort CSR columns CHECK_ROCSPARSE_ERROR(rocsparse_csrsort( handle, m, n, nnz, descr, dcsr_row_ptr, dcsr_col_ind, dperm, dbuffer)); - // Sort CSR values - CHECK_ROCSPARSE_ERROR(rocsparse_sgthr( - handle, nnz, dcsr_val, dcsr_val_sorted, dperm, rocsparse_index_base_zero)); + if(permute) + { + // Sort CSR values + CHECK_ROCSPARSE_ERROR(rocsparse_sgthr( + handle, nnz, dcsr_val, dcsr_val_sorted, dperm, rocsparse_index_base_zero)); + } // Copy output from device to host CHECK_HIP_ERROR(hipMemcpy(hcsr_col_ind_unsorted.data(), dcsr_col_ind, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy( - 
hcsr_val_unsorted.data(), dcsr_val_sorted, sizeof(float) * nnz, hipMemcpyDeviceToHost)); + + if(permute) + { + CHECK_HIP_ERROR(hipMemcpy(hcsr_val_unsorted.data(), + dcsr_val_sorted, + sizeof(float) * nnz, + hipMemcpyDeviceToHost)); + } // Unit check unit_check_general(1, nnz, hcsr_col_ind.data(), hcsr_col_ind_unsorted.data()); - unit_check_general(1, nnz, hcsr_val.data(), hcsr_val_unsorted.data()); + + if(permute) + { + unit_check_general(1, nnz, hcsr_val.data(), hcsr_val_unsorted.data()); + } } if(argus.timing) { - /* TODO - rocsparse_int number_cold_calls = 2; - rocsparse_int number_hot_calls = argus.iters; + rocsparse_int number_cold_calls = 2; + rocsparse_int number_hot_calls = argus.iters; - for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) - { - rocsparse_csrsort(handle, dcsr_row_ptr, nnz, m, dcoo_row_ind, idx_base); - } + // Allocate buffer for csrsort + rocsparse_csrsort_buffer_size(handle, m, n, nnz, dcsr_row_ptr, dcsr_col_ind, &buffer_size); - double gpu_time_used = get_time_us(); + auto dbuffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * buffer_size), device_free}; + void* dbuffer = (void*)dbuffer_managed.get(); - for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) - { - rocsparse_csrsort(handle, dcsr_row_ptr, nnz, m, dcoo_row_ind, idx_base); - } + for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) + { + rocsparse_csrsort( + handle, m, n, nnz, descr, dcsr_row_ptr, dcsr_col_ind, nullptr, dbuffer); + } + + double gpu_time_used = get_time_us(); - gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) + { + rocsparse_csrsort( + handle, m, n, nnz, descr, dcsr_row_ptr, dcsr_col_ind, nullptr, dbuffer); + } - double bandwidth = sizeof(rocsparse_int) * (nnz + m + 1) / gpu_time_used / 1e6; + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); - printf("m\t\tn\t\tnnz\t\tGB/s\tmsec\n"); - 
printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\n", m, n, nnz, bandwidth, gpu_time_used); - */ + printf("m\t\tn\t\tnnz\t\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\n", m, n, nnz, gpu_time_used); } return rocsparse_status_success; } diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index 31bb4254..a0800540 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -612,6 +612,7 @@ class Arguments rocsparse_int iters = 10; rocsparse_int laplacian = 0; rocsparse_int ell_width = 0; + rocsparse_int temp = 0; std::string filename = ""; @@ -636,6 +637,7 @@ class Arguments iters = rhs.iters; laplacian = rhs.laplacian; ell_width = rhs.ell_width; + temp = rhs.temp; filename = rhs.filename; diff --git a/clients/tests/test_csrsort.cpp b/clients/tests/test_csrsort.cpp index ad16ae67..083703ab 100644 --- a/clients/tests/test_csrsort.cpp +++ b/clients/tests/test_csrsort.cpp @@ -9,10 +9,11 @@ #include #include -typedef std::tuple csrsort_tuple; +typedef std::tuple csrsort_tuple; int csrsort_M_range[] = {-1, 0, 10, 500, 872, 1000}; int csrsort_N_range[] = {-3, 0, 33, 242, 623, 1000}; +int csrsort_perm[] = {0, 1}; rocsparse_index_base csrsort_base[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; class parameterized_csrsort : public testing::TestWithParam @@ -29,7 +30,8 @@ Arguments setup_csrsort_arguments(csrsort_tuple tup) Arguments arg; arg.M = std::get<0>(tup); arg.N = std::get<1>(tup); - arg.idx_base = std::get<2>(tup); + arg.temp = std::get<2>(tup); + arg.idx_base = std::get<3>(tup); arg.timing = 0; return arg; } @@ -48,4 +50,5 @@ INSTANTIATE_TEST_CASE_P(csrsort, parameterized_csrsort, testing::Combine(testing::ValuesIn(csrsort_M_range), testing::ValuesIn(csrsort_N_range), + testing::ValuesIn(csrsort_perm), testing::ValuesIn(csrsort_base))); diff --git a/library/src/conversion/csrsort_device.h b/library/src/conversion/csrsort_device.h new file mode 100644 index 00000000..8a6e6433 --- /dev/null +++ b/library/src/conversion/csrsort_device.h 
@@ -0,0 +1,25 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef CSRSORT_DEVICE_H +#define CSRSORT_DEVICE_H + +#include + +// Shift CSR offsets +__global__ void +csrsort_shift_kernel(rocsparse_int size, const rocsparse_int* in, rocsparse_int* out) +{ + int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if(gid >= size) + { + return; + } + + out[gid] = in[gid] - 1; +} + +#endif // CSRSORT_DEVICE_H diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index 7f1c53db..4f34e76a 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -159,10 +159,6 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, { return rocsparse_status_invalid_pointer; } - else if(perm == nullptr) - { - return rocsparse_status_invalid_pointer; - } else if(temp_buffer == nullptr) { return rocsparse_status_invalid_pointer; @@ -181,35 +177,54 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, unsigned int endbit = rocsparse_clz(n); size_t size; + if(perm != nullptr) + { +// Sort pairs, if permutation vector is present #if defined(__HIP_PLATFORM_HCC__) - rocprim::double_buffer dummy(csr_col_ind, perm); - - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(nullptr, - size, - dummy, - dummy, - nnz, - m, - csr_row_ptr, - csr_row_ptr + 1, - startbit, - endbit, - stream)); + rocprim::double_buffer dummy(csr_col_ind, perm); + + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(nullptr, + size, + dummy, + dummy, + nnz, + m, + csr_row_ptr, + csr_row_ptr + 1, + startbit, + endbit, + stream)); #elif defined(__HIP_PLATFORM_NVCC__) - hipcub::DoubleBuffer dummy(csr_col_ind, perm); + hipcub::DoubleBuffer dummy(csr_col_ind, perm); + + 
RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, + size, + dummy, + dummy, + nnz, + m, + csr_row_ptr, + csr_row_ptr + 1, + startbit, + endbit, + stream)); +#endif + } + else + { +// Sort keys, if no permutation vector is present +#if defined(__HIP_PLATFORM_HCC__) + rocprim::double_buffer dummy(csr_col_ind, csr_col_ind); - RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, - size, - dummy, - dummy, - nnz, - m, - csr_row_ptr, - csr_row_ptr + 1, - startbit, - endbit, - stream)); + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( + nullptr, size, dummy, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream)); +#elif defined(__HIP_PLATFORM_NVCC__) + hipcub::DoubleBuffer dummy(csr_col_ind, csr_col_ind); + + ETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortKeys( + nullptr, size, dummy, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream)); #endif + } // Temporary buffer entry points char* ptr = reinterpret_cast(temp_buffer); @@ -249,64 +264,159 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, // Switch between offsets const rocsparse_int* offsets = tmp_segm ? 
tmp_segm : csr_row_ptr; -// Sort by columns and obtain permutation vector + // Sort by columns and obtain permutation vector + if(perm != nullptr) + { +// Sort by pairs, if permutation vector is present #if defined(__HIP_PLATFORM_HCC__) - rocprim::double_buffer keys(csr_col_ind, tmp_cols); - rocprim::double_buffer vals(perm, tmp_perm); - - // Determine blocksize and items per thread depending on average nnz per row - rocsparse_int avg_row_nnz = nnz / m; + rocprim::double_buffer keys(csr_col_ind, tmp_cols); + rocprim::double_buffer vals(perm, tmp_perm); + + // Determine blocksize and items per thread depending on average nnz per row + rocsparse_int avg_row_nnz = nnz / m; + + if(avg_row_nnz < 64) + { + using config = + rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, + size, + keys, + vals, + nnz, + m, + offsets, + offsets + 1, + startbit, + endbit, + stream)); + } + else if(avg_row_nnz < 128) + { + using config = + rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, + size, + keys, + vals, + nnz, + m, + offsets, + offsets + 1, + startbit, + endbit, + stream)); + } + else if(avg_row_nnz < 256) + { + using config = + rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, + size, + keys, + vals, + nnz, + m, + offsets, + offsets + 1, + startbit, + endbit, + stream)); + } + else + { + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, + size, + keys, + vals, + nnz, + m, + offsets, + offsets + 1, + startbit, + endbit, + stream)); + } + if(keys.current() != csr_col_ind) + { + RETURN_IF_HIP_ERROR(hipMemcpy( + csr_col_ind, keys.current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } + if(vals.current() != perm) + { + RETURN_IF_HIP_ERROR(hipMemcpy( + perm, 
vals.current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } +#elif defined(__HIP_PLATFORM_NVCC__) + hipcub::DoubleBuffer keys(csr_col_ind, tmp_cols); + hipcub::DoubleBuffer vals(perm, tmp_perm); - if(avg_row_nnz < 64) - { - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs( - temp_buffer, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); - } - else if(avg_row_nnz < 128) - { - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs( - temp_buffer, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); - } - else if(avg_row_nnz < 256) - { - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs( + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs( temp_buffer, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); + if(keys.Current() != csr_col_ind) + { + RETURN_IF_HIP_ERROR(hipMemcpy( + csr_col_ind, keys.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } + if(vals.Current() != perm) + { + RETURN_IF_HIP_ERROR(hipMemcpy( + perm, vals.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } +#endif } else { - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs( - temp_buffer, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); - } - if(keys.current() != csr_col_ind) - { - RETURN_IF_HIP_ERROR(hipMemcpy( - csr_col_ind, keys.current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); - } - if(vals.current() != perm) - { - RETURN_IF_HIP_ERROR( - hipMemcpy(perm, vals.current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); - } +// Sort by keys, if no permutation vector is present +#if defined(__HIP_PLATFORM_HCC__) + 
rocprim::double_buffer keys(csr_col_ind, tmp_cols); + + // Determine blocksize and items per thread depending on average nnz per row + rocsparse_int avg_row_nnz = nnz / m; + + if(avg_row_nnz < 64) + { + using config = + rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( + temp_buffer, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); + } + else if(avg_row_nnz < 128) + { + using config = + rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( + temp_buffer, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); + } + else if(avg_row_nnz < 256) + { + using config = + rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( + temp_buffer, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); + } + else + { + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( + temp_buffer, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); + } + if(keys.current() != csr_col_ind) + { + RETURN_IF_HIP_ERROR(hipMemcpy( + csr_col_ind, keys.current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } #elif defined(__HIP_PLATFORM_NVCC__) - hipcub::DoubleBuffer keys(csr_col_ind, tmp_cols); - hipcub::DoubleBuffer vals(perm, tmp_perm); - - RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs( - temp_buffer, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); - if(keys.Current() != csr_col_ind) - { - RETURN_IF_HIP_ERROR(hipMemcpy( - csr_col_ind, keys.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); - } - if(vals.Current() != perm) - { - RETURN_IF_HIP_ERROR( - hipMemcpy(perm, vals.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); - } + hipcub::DoubleBuffer keys(csr_col_ind, tmp_cols); + + 
RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortKeys( + temp_buffer, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); + if(keys.Current() != csr_col_ind) + { + RETURN_IF_HIP_ERROR(hipMemcpy( + csr_col_ind, keys.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } #endif + } return rocsparse_status_success; } From 32f97680f65359cd23dded56795ccc9a52e1228e Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 28 Jun 2018 11:23:18 +0200 Subject: [PATCH 141/304] fix in csrsort() buffers for nvcc --- library/src/conversion/rocsparse_csrsort.cpp | 27 +++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index 4f34e76a..ddb9256d 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -221,14 +221,13 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, #elif defined(__HIP_PLATFORM_NVCC__) hipcub::DoubleBuffer dummy(csr_col_ind, csr_col_ind); - ETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortKeys( + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortKeys( nullptr, size, dummy, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream)); #endif } // Temporary buffer entry points char* ptr = reinterpret_cast(temp_buffer); - ptr += size; // columns buffer rocsparse_int* tmp_cols = reinterpret_cast(ptr); @@ -245,6 +244,7 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, if(descr->base == rocsparse_index_base_one) { tmp_segm = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * nnz; #define CSRSORT_DIM 512 dim3 csrsort_blocks(m / CSRSORT_DIM + 1); @@ -261,6 +261,9 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, #undef CSRSORT_DIM } + // rocprim buffer + void* tmp_rocprim = reinterpret_cast(ptr); + // Switch between offsets const rocsparse_int* offsets = tmp_segm ? 
tmp_segm : csr_row_ptr; @@ -280,7 +283,7 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, { using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, size, keys, vals, @@ -296,7 +299,7 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, { using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, size, keys, vals, @@ -312,7 +315,7 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, { using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, size, keys, vals, @@ -326,7 +329,7 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, } else { - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(temp_buffer, + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, size, keys, vals, @@ -353,7 +356,7 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, hipcub::DoubleBuffer vals(perm, tmp_perm); RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs( - temp_buffer, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); + tmp_rocprim, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); if(keys.Current() != csr_col_ind) { RETURN_IF_HIP_ERROR(hipMemcpy( @@ -380,26 +383,26 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( 
- temp_buffer, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); + tmp_rocprim, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); } else if(avg_row_nnz < 128) { using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( - temp_buffer, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); + tmp_rocprim, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); } else if(avg_row_nnz < 256) { using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( - temp_buffer, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); + tmp_rocprim, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); } else { RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( - temp_buffer, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); + tmp_rocprim, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); } if(keys.current() != csr_col_ind) { @@ -410,7 +413,7 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, hipcub::DoubleBuffer keys(csr_col_ind, tmp_cols); RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortKeys( - temp_buffer, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); + tmp_rocprim, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); if(keys.Current() != csr_col_ind) { RETURN_IF_HIP_ERROR(hipMemcpy( From 2bfd1a1a4cb6b22720bd890dd23ff8cf8cd7f62d Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 3 Jul 2018 16:34:29 +0200 Subject: [PATCH 142/304] added restrict keyword and some other performance improvements --- library/src/level2/coomv_device.h | 8 +++---- library/src/level2/ellmv_device.h | 2 +- library/src/level2/rocsparse_coomv.hpp | 30 ++++++++++++++------------ library/src/level2/rocsparse_ellmv.hpp | 16 
+++++++------- 4 files changed, 29 insertions(+), 27 deletions(-) diff --git a/library/src/level2/coomv_device.h b/library/src/level2/coomv_device.h index a2835b6e..069ecc66 100644 --- a/library/src/level2/coomv_device.h +++ b/library/src/level2/coomv_device.h @@ -10,7 +10,7 @@ // Scale kernel for beta != 1.0 template -__global__ void coomv_scale(rocsparse_int size, T scalar, T* data) +__global__ void coomv_scale(rocsparse_int size, T scalar, T* __restrict__ data) { rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -87,7 +87,7 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, if(idx < nnz) { row = coo_row_ind[idx] - idx_base; - val = alpha * coo_val[idx] * x[coo_col_ind[idx] - idx_base]; + val = alpha * coo_val[idx] * __ldg(x + coo_col_ind[idx] - idx_base); } else { @@ -185,8 +185,8 @@ static __device__ void segmented_blockreduce(const rocsparse_int* rows, T* vals) // Do the final block reduction of the block reduction buffers back into global memory template __global__ void coomvn_general_block_reduce(rocsparse_int nnz, - const rocsparse_int* row_block_red, - const T* val_block_red, + const rocsparse_int* __restrict__ row_block_red, + const T* __restrict__ val_block_red, T* y) { rocsparse_int tid = hipThreadIdx_x; diff --git a/library/src/level2/ellmv_device.h b/library/src/level2/ellmv_device.h index 9d9e19b5..ce61210a 100644 --- a/library/src/level2/ellmv_device.h +++ b/library/src/level2/ellmv_device.h @@ -34,7 +34,7 @@ static __device__ void ellmvn_device(rocsparse_int m, if(col >= 0 && col < n) { - sum += ell_val[idx] * x[col]; + sum += ell_val[idx] * __ldg(x + col); } else { diff --git a/library/src/level2/rocsparse_coomv.hpp b/library/src/level2/rocsparse_coomv.hpp index ae5b45f9..8a6689b1 100644 --- a/library/src/level2/rocsparse_coomv.hpp +++ b/library/src/level2/rocsparse_coomv.hpp @@ -15,16 +15,17 @@ #include template +__launch_bounds__(128) __global__ void coomvn_warp_host_pointer(rocsparse_int nnz, 
rocsparse_int loops, T alpha, - const rocsparse_int* coo_row_ind, - const rocsparse_int* coo_col_ind, - const T* coo_val, - const T* x, - T* y, - rocsparse_int* row_block_red, - T* val_block_red, + const rocsparse_int* __restrict__ coo_row_ind, + const rocsparse_int* __restrict__ coo_col_ind, + const T* __restrict__ coo_val, + const T* __restrict__ x, + T* __restrict__ y, + rocsparse_int* __restrict__ row_block_red, + T* __restrict__ val_block_red, rocsparse_index_base idx_base) { coomvn_general_warp_reduce(nnz, @@ -41,16 +42,17 @@ __global__ void coomvn_warp_host_pointer(rocsparse_int nnz, } template +__launch_bounds__(128) __global__ void coomvn_warp_device_pointer(rocsparse_int nnz, rocsparse_int loops, const T* alpha, - const rocsparse_int* coo_row_ind, - const rocsparse_int* coo_col_ind, - const T* coo_val, - const T* x, - T* y, - rocsparse_int* row_block_red, - T* val_block_red, + const rocsparse_int* __restrict__ coo_row_ind, + const rocsparse_int* __restrict__ coo_col_ind, + const T* __restrict__ coo_val, + const T* __restrict__ x, + T* __restrict__ y, + rocsparse_int* __restrict__ row_block_red, + T* __restrict__ val_block_red, rocsparse_index_base idx_base) { coomvn_general_warp_reduce(nnz, diff --git a/library/src/level2/rocsparse_ellmv.hpp b/library/src/level2/rocsparse_ellmv.hpp index 05adc1d3..246bd4b0 100644 --- a/library/src/level2/rocsparse_ellmv.hpp +++ b/library/src/level2/rocsparse_ellmv.hpp @@ -19,11 +19,11 @@ __global__ void ellmvn_kernel_host_pointer(rocsparse_int m, rocsparse_int n, rocsparse_int ell_width, T alpha, - const rocsparse_int* ell_col_ind, - const T* ell_val, - const T* x, + const rocsparse_int* __restrict__ ell_col_ind, + const T* __restrict__ ell_val, + const T* __restrict__ x, T beta, - T* y, + T* __restrict__ y, rocsparse_index_base idx_base) { ellmvn_device(m, n, ell_width, alpha, ell_col_ind, ell_val, x, beta, y, idx_base); @@ -34,11 +34,11 @@ __global__ void ellmvn_kernel_device_pointer(rocsparse_int m, rocsparse_int n, 
rocsparse_int ell_width, const T* alpha, - const rocsparse_int* ell_col_ind, - const T* ell_val, - const T* x, + const rocsparse_int* __restrict__ ell_col_ind, + const T* __restrict__ ell_val, + const T* __restrict__ x, const T* beta, - T* y, + T* __restrict__ y, rocsparse_index_base idx_base) { ellmvn_device(m, n, ell_width, *alpha, ell_col_ind, ell_val, x, *beta, y, idx_base); From 0b84b6855a95b63e8be9071f79fc2226acd2c646 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 3 Jul 2018 16:34:46 +0200 Subject: [PATCH 143/304] major perf. improvement for csrmv --- library/src/level2/csrmv_device.h | 283 +++++++++---------------- library/src/level2/rocsparse_csrmv.hpp | 72 +++---- 2 files changed, 140 insertions(+), 215 deletions(-) diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index 79d5626b..97a327c7 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -1,218 +1,143 @@ -/* ************************************************************************ -* Copyright 2015 Vratis, Ltd. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* ************************************************************************ */ - #pragma once #ifndef CSRMV_DEVICE_H #define CSRMV_DEVICE_H #include -// Knuth's Two-Sum algorithm, which allows us to add together two floating -// point numbers and exactly tranform the answer into a sum and a -// rounding error. -// Inputs: x and y, the two inputs to be aded together. 
-// In/Out: *sumk_err, which is incremented (by reference) -- holds the -// error value as a result of the 2sum calculation. -// Returns: The non-corrected sum of inputs x and y. -template -static __device__ T two_sum(T x, T y, T* sumk_err) +#if defined(__HIP_PLATFORM_HCC__) +// Swizzle-based reduction +template +__device__ float reduction(float sum) { - const T sumk_s = x + y; -#ifdef EXTENDED_PRECISION - // We use this 2Sum algorithm to perform a compensated summation, - // which can reduce the cummulative rounding errors in our SpMV summation. - // Our compensated sumation is based on the SumK algorithm (with K==2) from - // Ogita, Rump, and Oishi, "Accurate Sum and Dot Product" in - // SIAM J. on Scientific Computing 26(6) pp 1955-1988, Jun. 2005. - - // 2Sum can be done in 6 FLOPs without a branch. However, calculating - // double precision is slower than single precision on every existing GPU. - // As such, replacing 2Sum with Fast2Sum when using DPFP results in slightly - // better performance. This is especially true on non-workstation GPUs with - // low DPFP rates. Fast2Sum is faster even though we must ensure that - // |a| > |b|. Branch divergence is better than the DPFP slowdown. - // Thus, for DPFP, our compensated summation algorithm is actually described - // by both Pichat and Neumaier in "Correction d'une somme en arithmetique - // a virgule flottante" (J. Numerische Mathematik 19(5) pp. 400-406, 1972) - // and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher - // Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, - // 1974), respectively. 
- if(fabs(x) < fabs(y)) + // clang-format off + if(SUBWAVE_SIZE > 32) sum += hc::__amdgcn_readlane(sum, 32); + if(SUBWAVE_SIZE > 16) sum += hc::__amdgcn_ds_swizzle(sum, 0x401f); + if(SUBWAVE_SIZE > 8) sum += hc::__amdgcn_ds_swizzle(sum, 0x201f); + if(SUBWAVE_SIZE > 4) sum += hc::__amdgcn_ds_swizzle(sum, 0x101f); + if(SUBWAVE_SIZE > 2) sum += hc::__amdgcn_ds_swizzle(sum, 0x081f); + if(SUBWAVE_SIZE > 1) sum += hc::__amdgcn_ds_swizzle(sum, 0x041f); + // clang-format on + + return sum; +} + +// Swizzle-based reduction +template +__device__ double reduction(double sum) +{ + typedef union dbl_b32 { - const T swap = x; + double val; + uint32_t b32[2]; + } dbl_b32_t; + + dbl_b32_t upper_sum; + dbl_b32_t temp_sum; - x = y; - y = swap; + temp_sum.val = sum; + + if(SUBWAVE_SIZE > 32) + { + upper_sum.b32[0] = hc::__amdgcn_readlane(temp_sum.b32[0], 32); + upper_sum.b32[1] = hc::__amdgcn_readlane(temp_sum.b32[1], 32); + temp_sum.val += upper_sum.val; } - (*sumk_err) += (y - (sumk_s - x)); -// Original 6 FLOP 2Sum algorithm. -// T bp = sumk_s - x; -// (*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); -#endif - return sumk_s; -} -// Performs (x_vals * x_vec) + y using an FMA. -// Ideally, we would perform an error-free transformation here and return the -// appropriate error. However, the EFT of an FMA is very expensive. As such, -// if we are in EXTENDED_PRECISION mode, this function devolves into two_sum -// with x_vals and x_vec inputs multiplied separately from the compensated add. 
-template -static __device__ T two_fma(T x_vals, T x_vec, T y, T* sumk_err) -{ -#ifdef EXTENDED_PRECISION - T x = x_vals * x_vec; + if(SUBWAVE_SIZE > 16) + { + upper_sum.b32[0] = hc::__amdgcn_ds_swizzle(temp_sum.b32[0], 0x401f); + upper_sum.b32[1] = hc::__amdgcn_ds_swizzle(temp_sum.b32[1], 0x401f); + temp_sum.val += upper_sum.val; + } - const T sumk_s = x + y; - if(fabs(x) < fabs(y)) + if(SUBWAVE_SIZE > 8) { - const T swap = x; + upper_sum.b32[0] = hc::__amdgcn_ds_swizzle(temp_sum.b32[0], 0x201f); + upper_sum.b32[1] = hc::__amdgcn_ds_swizzle(temp_sum.b32[1], 0x201f); + temp_sum.val += upper_sum.val; + } - x = y; - y = swap; + if(SUBWAVE_SIZE > 4) + { + upper_sum.b32[0] = hc::__amdgcn_ds_swizzle(temp_sum.b32[0], 0x101f); + upper_sum.b32[1] = hc::__amdgcn_ds_swizzle(temp_sum.b32[1], 0x101f); + temp_sum.val += upper_sum.val; } - (*sumk_err) += (y - (sumk_s - x)); - // 2Sum in the FMA case. Poor performance on low-DPFP GPUs. - // const T bp = fma(-x_vals, x_vec, sumk_s); - // (*sumk_err) += (fma(x_vals, x_vec, -(sumk_s - bp)) + (y - bp)); - return sumk_s; -#else - return fma(x_vals, x_vec, y); -#endif -} -// A method of doing the final reduction without having to copy and paste -// it a bunch of times. -// The EXTENDED_PRECISION section is done as part of the PSum2 addition, -// where we take temporary sums and errors for multiple threads and combine -// them together using the same 2Sum method. -// Inputs: cur_sum: the input from which our sum starts -// err: the current running cascade error for this final summation -// partial: the local memory which holds the values to sum -// (we eventually use it to pass down temp. err vals as well) -// lid: local ID of the work item calling this function. -// thread_lane: The lane within this SUBWAVE for reduction. -// round: This parallel summation method operates in multiple rounds -// to do a parallel reduction. See the blow comment for usage. 
-template -static __device__ T -sum2_reduce(T cur_sum, T* err, volatile T* partial, int lid, int thread_lane, int round) + if(SUBWAVE_SIZE > 2) + { + upper_sum.b32[0] = hc::__amdgcn_ds_swizzle(temp_sum.b32[0], 0x081f); + upper_sum.b32[1] = hc::__amdgcn_ds_swizzle(temp_sum.b32[1], 0x081f); + temp_sum.val += upper_sum.val; + } + + if(SUBWAVE_SIZE > 1) + { + upper_sum.b32[0] = hc::__amdgcn_ds_swizzle(temp_sum.b32[0], 0x041f); + upper_sum.b32[1] = hc::__amdgcn_ds_swizzle(temp_sum.b32[1], 0x041f); + temp_sum.val += upper_sum.val; + } + + sum = temp_sum.val; + return sum; +} +#elif defined(__HIP_PLATFORM_NVCC__) +template +__device__ T reduction(T sum) { - if(SUBWAVE_SIZE > round) + for(rocsparse_int i = SUBWAVE_SIZE >> 1; i > 0; i >>= 1) { -#ifdef EXTENDED_PRECISION - const unsigned int partial_dest = lid + round; - if(thread_lane < round) - cur_sum = two_sum(cur_sum, partial[partial_dest], err); - // We reuse the LDS entries to move the error values down into lower - // threads. This saves LDS space, allowing higher occupancy, but requires - // more barriers, which can reduce performance. - __syncthreads(); - // Have all of those upper threads pass their temporary errors - // into a location that the lower threads can read. - if(thread_lane >= round) - partial[lid] = *err; - __syncthreads(); - if(thread_lane < round) - { // Add those errors in. - *err += partial[partial_dest]; - partial[lid] = cur_sum; - } -#else - // This is the more traditional reduction algorithm. It is up to - // 25% faster (about 10% on average -- potentially worse on devices - // with low double-precision calculation rates), but can result in - // numerical inaccuracies, especially in single precision. 
- cur_sum += partial[lid + round]; - __syncthreads(); - partial[lid] = cur_sum; -#endif + sum += __shfl_down_sync(0xffffffff, sum, i); } - return cur_sum; + + return sum; } +#endif -// Uses macro constants: -// WAVE_SIZE - "warp size", typically 64 (AMD) or 32 (NV) -// WG_SIZE - workgroup ("block") size, 1D representation assumed -// int - typename for the type of integer data read by the kernel, usually unsigned int -// T - typename for the type of floating point data, usually double -// SUBWAVE_SIZE - the length of a "sub-wave", a power of 2, i.e. 1,2,4,...,WAVE_SIZE, assigned to -// process a single matrix row -template -static __device__ void csrmvn_general_device(int num_rows, +template +static __device__ void csrmvn_general_device(rocsparse_int m, T alpha, - const int* row_offset, - const int* col, - const T* val, + const rocsparse_int* row_offset, + const rocsparse_int* csr_col_ind, + const T* csr_val, const T* x, T beta, T* y, rocsparse_index_base idx_base) { - __shared__ volatile T sdata[WG_SIZE + SUBWAVE_SIZE / 2]; + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; + rocsparse_int lid = tid & (SUBWAVE_SIZE - 1); + rocsparse_int nwarps = hipGridDim_x * hipBlockDim_x / SUBWAVE_SIZE; - // const int vectors_per_block = WG_SIZE/SUBWAVE_SIZE; - const int global_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // global workitem id - const int local_id = hipThreadIdx_x; // local workitem id - const int thread_lane = local_id & (SUBWAVE_SIZE - 1); - const int vector_id = global_id / SUBWAVE_SIZE; // global vector id - // const int vector_lane = local_id / SUBWAVE_SIZE; // vector id within the workgroup - const int num_vectors = hipGridDim_x * hipBlockDim_x / SUBWAVE_SIZE; - - for(int row = vector_id; row < num_rows; row += num_vectors) + // Loop over rows each subwave processes + for(rocsparse_int row = gid / SUBWAVE_SIZE; row < m; row += nwarps) { - const int row_start = row_offset[row] - idx_base; - const int 
row_end = row_offset[row + 1] - idx_base; - - T sum = 0.; - T sumk_e = 0.; - - // It is about 5% faster to always multiply by alpha, rather than to - // check whether alpha is 0, 1, or other and do different code paths. - for(int j = row_start + thread_lane; j < row_end; j += SUBWAVE_SIZE) - sum = two_fma(alpha * val[j], x[col[j] - idx_base], sum, &sumk_e); - - T new_error = 0.; - sum = two_sum(sum, sumk_e, &new_error); - - // Parallel reduction in shared memory. - sdata[local_id] = sum; - -// This compensated summation reduces cummulative rounding errors, -// which can become a problem on GPUs because our reduction order is -// different than what would be used on a CPU. -// It is based on the PSumK algorithm (with K==2) from -// Yamanaka, Ogita, Rump, and Oishi, "A Parallel Algorithm of -// Accurate Dot Product," in the Journal of Parallel Computing, -// 34(6-8), pp. 392-410, Jul. 2008. -#pragma unroll - for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) + // Each subwave processes one row + rocsparse_int row_start = row_offset[row] - idx_base; + rocsparse_int row_end = row_offset[row + 1] - idx_base; + + T sum = 0.0; + + // Loop over non-zero elements of subwave row + for(rocsparse_int j = row_start + lid; j < row_end; j += SUBWAVE_SIZE) { - __syncthreads(); - sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, i); + sum = fma(alpha * csr_val[j], __ldg(x + csr_col_ind[j] - idx_base), sum); } - if(thread_lane == 0) + // Obtain row sum using parallel reduction + sum = reduction(sum); + + // First thread of each subwave writes result into global memory + if(lid == 0) { if(beta == 0) - y[row] = sum + new_error; + { + y[row] = sum; + } else { - sum = two_fma(beta, y[row], sum, &new_error); - y[row] = sum + new_error; + y[row] = fma(beta, y[row], sum); } } } diff --git a/library/src/level2/rocsparse_csrmv.hpp b/library/src/level2/rocsparse_csrmv.hpp index f2ad0819..063aade9 100644 --- a/library/src/level2/rocsparse_csrmv.hpp +++ 
b/library/src/level2/rocsparse_csrmv.hpp @@ -13,33 +13,33 @@ #include -template +template __global__ void csrmvn_kernel_host_pointer(rocsparse_int m, T alpha, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const T* csr_val, - const T* x, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, T beta, - T* y, + T* __restrict__ y, rocsparse_index_base idx_base) { - csrmvn_general_device( + csrmvn_general_device( m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y, idx_base); } -template +template __global__ void csrmvn_kernel_device_pointer(rocsparse_int m, const T* alpha, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const T* csr_val, - const T* x, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, const T* beta, - T* y, + T* __restrict__ y, rocsparse_index_base idx_base) { - csrmvn_general_device( + csrmvn_general_device( m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y, idx_base); } @@ -183,7 +183,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { if(nnz_per_row < 4) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -200,7 +200,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 8) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -217,7 +217,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 16) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -234,7 +234,7 @@ rocsparse_status 
rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 32) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -251,7 +251,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -271,7 +271,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { if(nnz_per_row < 4) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -288,7 +288,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 8) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -305,7 +305,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 16) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -322,7 +322,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 32) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -339,7 +339,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 64) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -356,7 +356,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -388,7 +388,7 @@ 
rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { if(nnz_per_row < 4) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -405,7 +405,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 8) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -422,7 +422,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 16) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -439,7 +439,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 32) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -456,7 +456,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -476,7 +476,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { if(nnz_per_row < 4) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -493,7 +493,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 8) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -510,7 +510,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 16) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -527,7 +527,7 @@ 
rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 32) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -544,7 +544,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 64) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -561,7 +561,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, From 0c9d5ca06c6c3a6ee306ee3a47952d141e8bf97b Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 3 Jul 2018 18:54:49 +0200 Subject: [PATCH 144/304] int -> rocsparse_int --- library/src/conversion/csrsort_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/src/conversion/csrsort_device.h b/library/src/conversion/csrsort_device.h index 8a6e6433..815579f7 100644 --- a/library/src/conversion/csrsort_device.h +++ b/library/src/conversion/csrsort_device.h @@ -12,7 +12,7 @@ __global__ void csrsort_shift_kernel(rocsparse_int size, const rocsparse_int* in, rocsparse_int* out) { - int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; if(gid >= size) { From 97b31864164d9da0038f39d2ef94039ade249e8c Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 4 Jul 2018 08:19:25 +0200 Subject: [PATCH 145/304] workaround while there is no hc::__amdgcn_readlane() --- library/src/level2/csrmv_device.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index 97a327c7..ddfaf2af 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -10,7 +10,14 
@@ template __device__ float reduction(float sum) { // clang-format off - if(SUBWAVE_SIZE > 32) sum += hc::__amdgcn_readlane(sum, 32); +// if(SUBWAVE_SIZE > 32) sum += hc::__amdgcn_readlane(sum, 32); + // TODO: readlane is not ported to HC + if(SUBWAVE_SIZE > 32) + { + float val = 0.0f; + __asm__ volatile("v_readlane_b32 %0 %1 32" : "=s"(val) : "v"(sum)); + sum += val; + } if(SUBWAVE_SIZE > 16) sum += hc::__amdgcn_ds_swizzle(sum, 0x401f); if(SUBWAVE_SIZE > 8) sum += hc::__amdgcn_ds_swizzle(sum, 0x201f); if(SUBWAVE_SIZE > 4) sum += hc::__amdgcn_ds_swizzle(sum, 0x101f); @@ -38,8 +45,11 @@ __device__ double reduction(double sum) if(SUBWAVE_SIZE > 32) { - upper_sum.b32[0] = hc::__amdgcn_readlane(temp_sum.b32[0], 32); - upper_sum.b32[1] = hc::__amdgcn_readlane(temp_sum.b32[1], 32); +// upper_sum.b32[0] = hc::__amdgcn_readlane(temp_sum.b32[0], 32); +// upper_sum.b32[1] = hc::__amdgcn_readlane(temp_sum.b32[1], 32); + // TODO readlane is not ported to HC + __asm__ volatile("v_readlane_b32 %0 %1 32" : "=s"(upper_sum.b32[0]) : "v"(temp_sum.b32[0])); + __asm__ volatile("v_readlane_b32 %0 %1 32" : "=s"(upper_sum.b32[1]) : "v"(temp_sum.b32[1])); temp_sum.val += upper_sum.val; } From bc52e9cf46f1c6f92e3b72e31b79cef4c3ddc11a Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 4 Jul 2018 08:25:56 +0200 Subject: [PATCH 146/304] clang-format --- clients/include/testing_coo2csr.hpp | 5 +-- library/src/level2/csrmv_device.h | 10 +++--- library/src/level2/rocsparse_coomv.hpp | 48 +++++++++++++------------- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/clients/include/testing_coo2csr.hpp b/clients/include/testing_coo2csr.hpp index 1d33543c..a160c7ac 100644 --- a/clients/include/testing_coo2csr.hpp +++ b/clients/include/testing_coo2csr.hpp @@ -128,7 +128,7 @@ rocsparse_status testing_coo2csr(Arguments argus) { std::vector hptr(m + 1); m = n = gen_2d_laplacian(argus.laplacian, hptr, hcoo_col_ind, hcoo_val, idx_base); - nnz = hptr[m]; + nnz = hptr[m]; 
hcoo_row_ind.resize(nnz); // Convert to COO @@ -144,7 +144,8 @@ rocsparse_status testing_coo2csr(Arguments argus) { if(argus.filename != "") { - if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val) != 0) + if(read_mtx_matrix( + argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index ddfaf2af..4659201d 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -45,8 +45,8 @@ __device__ double reduction(double sum) if(SUBWAVE_SIZE > 32) { -// upper_sum.b32[0] = hc::__amdgcn_readlane(temp_sum.b32[0], 32); -// upper_sum.b32[1] = hc::__amdgcn_readlane(temp_sum.b32[1], 32); + // upper_sum.b32[0] = hc::__amdgcn_readlane(temp_sum.b32[0], 32); + // upper_sum.b32[1] = hc::__amdgcn_readlane(temp_sum.b32[1], 32); // TODO readlane is not ported to HC __asm__ volatile("v_readlane_b32 %0 %1 32" : "=s"(upper_sum.b32[0]) : "v"(temp_sum.b32[0])); __asm__ volatile("v_readlane_b32 %0 %1 32" : "=s"(upper_sum.b32[1]) : "v"(temp_sum.b32[1])); @@ -115,9 +115,9 @@ static __device__ void csrmvn_general_device(rocsparse_int m, T* y, rocsparse_index_base idx_base) { - rocsparse_int tid = hipThreadIdx_x; - rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; - rocsparse_int lid = tid & (SUBWAVE_SIZE - 1); + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; + rocsparse_int lid = tid & (SUBWAVE_SIZE - 1); rocsparse_int nwarps = hipGridDim_x * hipBlockDim_x / SUBWAVE_SIZE; // Loop over rows each subwave processes diff --git a/library/src/level2/rocsparse_coomv.hpp b/library/src/level2/rocsparse_coomv.hpp index 8a6689b1..980579d4 100644 --- a/library/src/level2/rocsparse_coomv.hpp +++ b/library/src/level2/rocsparse_coomv.hpp @@ -15,18 +15,18 @@ #include template 
-__launch_bounds__(128) -__global__ void coomvn_warp_host_pointer(rocsparse_int nnz, - rocsparse_int loops, - T alpha, - const rocsparse_int* __restrict__ coo_row_ind, - const rocsparse_int* __restrict__ coo_col_ind, - const T* __restrict__ coo_val, - const T* __restrict__ x, - T* __restrict__ y, - rocsparse_int* __restrict__ row_block_red, - T* __restrict__ val_block_red, - rocsparse_index_base idx_base) +__launch_bounds__(128) __global__ + void coomvn_warp_host_pointer(rocsparse_int nnz, + rocsparse_int loops, + T alpha, + const rocsparse_int* __restrict__ coo_row_ind, + const rocsparse_int* __restrict__ coo_col_ind, + const T* __restrict__ coo_val, + const T* __restrict__ x, + T* __restrict__ y, + rocsparse_int* __restrict__ row_block_red, + T* __restrict__ val_block_red, + rocsparse_index_base idx_base) { coomvn_general_warp_reduce(nnz, loops, @@ -42,18 +42,18 @@ __global__ void coomvn_warp_host_pointer(rocsparse_int nnz, } template -__launch_bounds__(128) -__global__ void coomvn_warp_device_pointer(rocsparse_int nnz, - rocsparse_int loops, - const T* alpha, - const rocsparse_int* __restrict__ coo_row_ind, - const rocsparse_int* __restrict__ coo_col_ind, - const T* __restrict__ coo_val, - const T* __restrict__ x, - T* __restrict__ y, - rocsparse_int* __restrict__ row_block_red, - T* __restrict__ val_block_red, - rocsparse_index_base idx_base) +__launch_bounds__(128) __global__ + void coomvn_warp_device_pointer(rocsparse_int nnz, + rocsparse_int loops, + const T* alpha, + const rocsparse_int* __restrict__ coo_row_ind, + const rocsparse_int* __restrict__ coo_col_ind, + const T* __restrict__ coo_val, + const T* __restrict__ x, + T* __restrict__ y, + rocsparse_int* __restrict__ row_block_red, + T* __restrict__ val_block_red, + rocsparse_index_base idx_base) { coomvn_general_warp_reduce(nnz, loops, From b227205e924579bf829531f9a627a87d2014df12 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 5 Jul 2018 10:51:27 +0200 Subject: [PATCH 147/304] changed hc:: 
intrinsics to llvm --- library/src/level2/csrmv_device.h | 52 ++++++++++++++----------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index 4659201d..8b0c672c 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -4,25 +4,24 @@ #include +#if defined(__HIP_PLATFORM_HCC__) +// While HIP does not contain llvm intrinsics +__device__ int __llvm_amdgcn_ds_swizzle(int index, int pattern) __asm("llvm.amdgcn.ds.swizzle"); +__device__ int __llvm_amdgcn_readlane(int index, int offset) __asm("llvm.amdgcn.readlane"); +#endif + #if defined(__HIP_PLATFORM_HCC__) // Swizzle-based reduction template __device__ float reduction(float sum) { // clang-format off -// if(SUBWAVE_SIZE > 32) sum += hc::__amdgcn_readlane(sum, 32); - // TODO: readlane is not ported to HC - if(SUBWAVE_SIZE > 32) - { - float val = 0.0f; - __asm__ volatile("v_readlane_b32 %0 %1 32" : "=s"(val) : "v"(sum)); - sum += val; - } - if(SUBWAVE_SIZE > 16) sum += hc::__amdgcn_ds_swizzle(sum, 0x401f); - if(SUBWAVE_SIZE > 8) sum += hc::__amdgcn_ds_swizzle(sum, 0x201f); - if(SUBWAVE_SIZE > 4) sum += hc::__amdgcn_ds_swizzle(sum, 0x101f); - if(SUBWAVE_SIZE > 2) sum += hc::__amdgcn_ds_swizzle(sum, 0x081f); - if(SUBWAVE_SIZE > 1) sum += hc::__amdgcn_ds_swizzle(sum, 0x041f); + if(SUBWAVE_SIZE > 32) sum += __llvm_amdgcn_readlane(sum, 32); + if(SUBWAVE_SIZE > 16) sum += __llvm_amdgcn_ds_swizzle(sum, 0x401f); + if(SUBWAVE_SIZE > 8) sum += __llvm_amdgcn_ds_swizzle(sum, 0x201f); + if(SUBWAVE_SIZE > 4) sum += __llvm_amdgcn_ds_swizzle(sum, 0x101f); + if(SUBWAVE_SIZE > 2) sum += __llvm_amdgcn_ds_swizzle(sum, 0x081f); + if(SUBWAVE_SIZE > 1) sum += __llvm_amdgcn_ds_swizzle(sum, 0x041f); // clang-format on return sum; @@ -45,46 +44,43 @@ __device__ double reduction(double sum) if(SUBWAVE_SIZE > 32) { - // upper_sum.b32[0] = hc::__amdgcn_readlane(temp_sum.b32[0], 32); - // upper_sum.b32[1] = 
hc::__amdgcn_readlane(temp_sum.b32[1], 32); - // TODO readlane is not ported to HC - __asm__ volatile("v_readlane_b32 %0 %1 32" : "=s"(upper_sum.b32[0]) : "v"(temp_sum.b32[0])); - __asm__ volatile("v_readlane_b32 %0 %1 32" : "=s"(upper_sum.b32[1]) : "v"(temp_sum.b32[1])); + upper_sum.b32[0] = __llvm_amdgcn_readlane(temp_sum.b32[0], 32); + upper_sum.b32[1] = __llvm_amdgcn_readlane(temp_sum.b32[1], 32); temp_sum.val += upper_sum.val; } if(SUBWAVE_SIZE > 16) { - upper_sum.b32[0] = hc::__amdgcn_ds_swizzle(temp_sum.b32[0], 0x401f); - upper_sum.b32[1] = hc::__amdgcn_ds_swizzle(temp_sum.b32[1], 0x401f); + upper_sum.b32[0] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[0], 0x401f); + upper_sum.b32[1] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[1], 0x401f); temp_sum.val += upper_sum.val; } if(SUBWAVE_SIZE > 8) { - upper_sum.b32[0] = hc::__amdgcn_ds_swizzle(temp_sum.b32[0], 0x201f); - upper_sum.b32[1] = hc::__amdgcn_ds_swizzle(temp_sum.b32[1], 0x201f); + upper_sum.b32[0] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[0], 0x201f); + upper_sum.b32[1] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[1], 0x201f); temp_sum.val += upper_sum.val; } if(SUBWAVE_SIZE > 4) { - upper_sum.b32[0] = hc::__amdgcn_ds_swizzle(temp_sum.b32[0], 0x101f); - upper_sum.b32[1] = hc::__amdgcn_ds_swizzle(temp_sum.b32[1], 0x101f); + upper_sum.b32[0] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[0], 0x101f); + upper_sum.b32[1] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[1], 0x101f); temp_sum.val += upper_sum.val; } if(SUBWAVE_SIZE > 2) { - upper_sum.b32[0] = hc::__amdgcn_ds_swizzle(temp_sum.b32[0], 0x081f); - upper_sum.b32[1] = hc::__amdgcn_ds_swizzle(temp_sum.b32[1], 0x081f); + upper_sum.b32[0] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[0], 0x081f); + upper_sum.b32[1] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[1], 0x081f); temp_sum.val += upper_sum.val; } if(SUBWAVE_SIZE > 1) { - upper_sum.b32[0] = hc::__amdgcn_ds_swizzle(temp_sum.b32[0], 0x041f); - upper_sum.b32[1] = hc::__amdgcn_ds_swizzle(temp_sum.b32[1], 0x041f); + upper_sum.b32[0] = 
__llvm_amdgcn_ds_swizzle(temp_sum.b32[0], 0x041f); + upper_sum.b32[1] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[1], 0x041f); temp_sum.val += upper_sum.val; } From 76d69af94cbd02e0267ee3c648d025dfc5e3cc2c Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 10 Jul 2018 10:44:16 +0200 Subject: [PATCH 148/304] changed binsearch from recursiv to iterative (slightly faster) --- library/src/conversion/coo2csr_device.h | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/library/src/conversion/coo2csr_device.h b/library/src/conversion/coo2csr_device.h index 676a7174..c27fa48e 100644 --- a/library/src/conversion/coo2csr_device.h +++ b/library/src/conversion/coo2csr_device.h @@ -14,22 +14,21 @@ static inline __device__ rocsparse_int lower_bound(const rocsparse_int* arr, rocsparse_int low, rocsparse_int high) { - if(low > high) + while(low < high) { - return low; + rocsparse_int mid = low + ((high - low) >> 1); + + if(arr[mid] < key) + { + low = mid + 1; + } + else + { + high = mid; + } } - rocsparse_int mid = low + ((high - low) >> 1); - - if(arr[mid] >= key) - { - high = mid - 1; - } - else - { - low = mid + 1; - } - return lower_bound(arr, key, low, high); + return low; } // COO to CSR matrix conversion kernel From 318210882199073b3590b0f2434703f1b90684dc Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 12 Jul 2018 16:13:57 +0200 Subject: [PATCH 149/304] added csr2csc (== csr/csc matrix transpose) --- library/include/rocsparse-functions.h | 109 ++++++++++ library/include/rocsparse-types.h | 6 + library/src/CMakeLists.txt | 1 + library/src/conversion/csr2csc_device.h | 30 +++ library/src/conversion/rocsparse_csr2csc.cpp | 149 ++++++++++++++ library/src/conversion/rocsparse_csr2csc.hpp | 198 +++++++++++++++++++ 6 files changed, 493 insertions(+) create mode 100644 library/src/conversion/csr2csc_device.h create mode 100644 library/src/conversion/rocsparse_csr2csc.cpp create mode 100644 library/src/conversion/rocsparse_csr2csc.hpp 
diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index f5b8cb8b..0fa664a2 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -841,6 +841,115 @@ rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, rocsparse_int* coo_row_ind, rocsparse_index_base idx_base); +/*! \brief SPARSE Format Conversions API + + \details + csr2csc_buffer_size returns the size of the temporary storage buffer + that is required by csr2csc. The temporary storage buffer has to be + allocated by the user. + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero elements of A. + @param[in] + csr_row_ptr array of m+1 elements that point to the start + of every row of A. + @param[in] + csr_col_ind array of nnz elements containing the column indices + of A. + @param[in] + copy_values rocsparse_action_symbolic or rocsparse_action_numeric + @param[out] + buffer_size number of bytes of the temporary storage buffer. + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csr2csc_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_action copy_values, + size_t* buffer_size); + +/*! \brief SPARSE Format Conversions API + + \details + csr2csc converts a CSR matrix into a CSC matrix. The resulting matrix + can also be seen as the transpose of the original CSR matrix. csr2csc + can also be used to convert a CSC matrix into a CSR matrix. + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + m number of rows of the sparse matrix. + @param[in] + n number of columns of the sparse matrix. 
+ @param[in] + nnz number of non-zero entries of the sparse matrix. + @param[in] + csr_val array of nnz elements of the CSR matrix. + @param[in] + csr_row_ptr array of m+1 elements that point to the start of + every row of the CSR matrix. + @param[in] + csr_col_ind array of nnz elements containing the column indices + of the CSR matrix. + @param[out] + csc_val array of nnz elements of the CSC matrix. + @param[out] + csc_row_ind array of nnz elements containing the row indices of + the CSC matrix. + @param[out] + csc_col_ptr array of n+1 elements that point to the start of + every column of the CSC matrix. + @param[in] + copy_values rocsparse_action_symbolic or rocsparse_action_numeric + @param[in] + idx_base rocsparse_index_base_zero or rocsparse_index_base_one. + @param[in] + temp_buffer temporary storage buffer allocated by the user, + size is returned by csr2csc_buffer_size + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsr2csc(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + float* csc_val, + rocsparse_int* csc_row_ind, + rocsparse_int* csc_col_ptr, + rocsparse_action copy_values, + rocsparse_index_base idx_base, + void* temp_buffer); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsr2csc(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + double* csc_val, + rocsparse_int* csc_row_ind, + rocsparse_int* csc_col_ptr, + rocsparse_action copy_values, + rocsparse_index_base idx_base, + void* temp_buffer); + /*! 
\brief SPARSE Format Conversions API \details diff --git a/library/include/rocsparse-types.h b/library/include/rocsparse-types.h index 1aabae39..85cf5169 100644 --- a/library/include/rocsparse-types.h +++ b/library/include/rocsparse-types.h @@ -53,6 +53,12 @@ typedef enum rocsparse_matrix_type_ { rocsparse_matrix_type_triangular = 3 /**< triangular matrix type. */ } rocsparse_matrix_type; +/*! \brief Used to specify where the operation is performed on. */ +typedef enum rocsparse_action_ { + rocsparse_action_symbolic = 0, /**< Operate only on indices. */ + rocsparse_action_numeric = 1 /**< Operate on data and indices. */ +} rocsparse_action; + /*! \brief HYB matrix partition type. */ typedef enum rocsparse_hyb_partition_ { rocsparse_hyb_partition_auto = 0, /**< automatically decide on ELL nnz per row. */ diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 508b0ae9..cc84e6af 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -25,6 +25,7 @@ set(rocsparse_source # Conversion src/conversion/rocsparse_csr2coo.cpp + src/conversion/rocsparse_csr2csc.cpp src/conversion/rocsparse_csr2ell.cpp src/conversion/rocsparse_csr2hyb.cpp src/conversion/rocsparse_coo2csr.cpp diff --git a/library/src/conversion/csr2csc_device.h b/library/src/conversion/csr2csc_device.h new file mode 100644 index 00000000..209480ca --- /dev/null +++ b/library/src/conversion/csr2csc_device.h @@ -0,0 +1,30 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef CSR2CSC_DEVICE_H +#define CSR2CSC_DEVICE_H + +#include + +template +__global__ void csr2csc_permute_kernel(rocsparse_int nnz, + const rocsparse_int* in1, + const T* in2, + const rocsparse_int* map, + rocsparse_int* out1, + T* out2) +{ + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if(gid >= nnz) + { + return; + } + + out1[gid] = in1[map[gid]]; + out2[gid] = in2[map[gid]]; +} + +#endif // CSR2CSC_DEVICE_H diff --git a/library/src/conversion/rocsparse_csr2csc.cpp b/library/src/conversion/rocsparse_csr2csc.cpp new file mode 100644 index 00000000..cf62f5e0 --- /dev/null +++ b/library/src/conversion/rocsparse_csr2csc.cpp @@ -0,0 +1,149 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "rocsparse.h" +#include "rocsparse_csr2csc.hpp" + +#include +#include + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_csr2csc_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_action copy_values, + size_t* buffer_size) +{ + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging TODO bench logging + log_trace(handle, + "rocsparse_csr2csc_buffer_size", + m, + n, + nnz, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + copy_values, + (const void*&)buffer_size); + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(n < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { 
+ return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(buffer_size == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0 || nnz == 0) + { + *buffer_size = 0; + return rocsparse_status_success; + } + + hipStream_t stream = handle->stream; + + // Determine hipcub buffer size + rocsparse_int* ptr = reinterpret_cast(buffer_size); + hipcub::DoubleBuffer dummy(ptr, ptr); + + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(nullptr, *buffer_size, dummy, dummy, nnz, 0, 32, stream)); + + // rocPRIM does not support in-place sorting, so we need additional buffer + // for all temporary arrays + *buffer_size += sizeof(rocsparse_int) * nnz * 3; + + return rocsparse_status_success; +} + +extern "C" rocsparse_status rocsparse_scsr2csc(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + float* csc_val, + rocsparse_int* csc_row_ind, + rocsparse_int* csc_col_ptr, + rocsparse_action copy_values, + rocsparse_index_base idx_base, + void* temp_buffer) +{ + return rocsparse_csr2csc_template(handle, + m, + n, + nnz, + csr_val, + csr_row_ptr, + csr_col_ind, + csc_val, + csc_row_ind, + csc_col_ptr, + copy_values, + idx_base, + temp_buffer); +} + +extern "C" rocsparse_status rocsparse_dcsr2csc(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + double* csc_val, + rocsparse_int* csc_row_ind, + rocsparse_int* csc_col_ptr, + rocsparse_action copy_values, + rocsparse_index_base idx_base, + void* temp_buffer) +{ + return rocsparse_csr2csc_template(handle, + m, + n, + nnz, + 
csr_val, + csr_row_ptr, + csr_col_ind, + csc_val, + csc_row_ind, + csc_col_ptr, + copy_values, + idx_base, + temp_buffer); +} diff --git a/library/src/conversion/rocsparse_csr2csc.hpp b/library/src/conversion/rocsparse_csr2csc.hpp new file mode 100644 index 00000000..06392068 --- /dev/null +++ b/library/src/conversion/rocsparse_csr2csc.hpp @@ -0,0 +1,198 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_CSR2CSC_HPP +#define ROCSPARSE_CSR2CSC_HPP + +#include "rocsparse.h" +#include "definitions.h" +#include "handle.h" +#include "utility.h" +#include "csr2csc_device.h" + +#include +#include + +template +rocsparse_status rocsparse_csr2csc_template(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + T* csc_val, + rocsparse_int* csc_row_ind, + rocsparse_int* csc_col_ptr, + rocsparse_action copy_values, + rocsparse_index_base idx_base, + void* temp_buffer) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging TODO bench logging + log_trace(handle, + replaceX("rocsparse_Xcsr2csc"), + m, + n, + nnz, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)csc_val, + (const void*&)csc_row_ind, + (const void*&)csc_col_ptr, + copy_values, + idx_base, + (const void*&)temp_buffer); + + // Check index base + if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + + // Check sizes + if(m < 0 || n < 0 || nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_val == nullptr && rocsparse_action_numeric) + { + return 
rocsparse_status_invalid_pointer; + } + else if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csc_val == nullptr && copy_values == rocsparse_action_numeric) + { + return rocsparse_status_invalid_pointer; + } + else if(csc_row_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csc_col_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(temp_buffer == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + unsigned int startbit = 0; + unsigned int endbit = rocsparse_clz(n); + + // Temporary buffer entry points + char* ptr = reinterpret_cast(temp_buffer); + + // work1 buffer + rocsparse_int* tmp_work1 = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * nnz; + + // work2 buffer + rocsparse_int* tmp_work2 = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * nnz; + + // perm buffer + rocsparse_int* tmp_perm = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * nnz; + + // hipcub buffer + void* tmp_hipcub = reinterpret_cast(ptr); + + // Load CSR column indices into work1 buffer + RETURN_IF_HIP_ERROR(hipMemcpy(tmp_work1, csr_col_ind, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + + if(copy_values == rocsparse_action_symbolic) + { + // action symbolic + + // Create row indices + RETURN_IF_ROCSPARSE_ERROR(rocsparse_csr2coo(handle, csr_row_ptr, nnz, m, csc_row_ind, idx_base)); + // Stable sort COO by columns + hipcub::DoubleBuffer keys(tmp_work1, tmp_perm); + hipcub::DoubleBuffer vals(csc_row_ind, tmp_work2); + + size_t size = 0; + + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(nullptr, size, keys, vals, nnz, startbit, endbit, stream)); + 
RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(tmp_hipcub, size, keys, vals, nnz, startbit, endbit, stream)); + + // Create column pointers + RETURN_IF_ROCSPARSE_ERROR(rocsparse_coo2csr(handle, keys.Current(), nnz, n, csc_col_ptr, idx_base)); + + // Copy csc_row_ind if not current + if(vals.Current() != csc_row_ind) + { + RETURN_IF_HIP_ERROR(hipMemcpy(csc_row_ind, vals.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } + } + else + { + // action numeric + + // Create identitiy permutation + RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_identity_permutation(handle, nnz, tmp_perm)); + + // Stable sort COO by columns + hipcub::DoubleBuffer keys(tmp_work1, csc_row_ind); + hipcub::DoubleBuffer vals(tmp_perm, tmp_work2); + + size_t size = 0; + + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(nullptr, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(tmp_hipcub, size, keys, vals, nnz, startbit, endbit, stream)); + + // Create column pointers + RETURN_IF_ROCSPARSE_ERROR(rocsparse_coo2csr(handle, keys.Current(), nnz, n, csc_col_ptr, idx_base)); + + // Create row indices + RETURN_IF_ROCSPARSE_ERROR(rocsparse_csr2coo(handle, csr_row_ptr, nnz, m, tmp_work1, idx_base)); + + // Permute row indices and values +#define CSR2CSC_DIM 512 + dim3 csr2csc_blocks((nnz - 1) / CSR2CSC_DIM + 1); + dim3 csr2csc_threads(CSR2CSC_DIM); + + hipLaunchKernelGGL((csr2csc_permute_kernel), + csr2csc_blocks, + csr2csc_threads, + 0, + stream, + nnz, + tmp_work1, + csr_val, + vals.Current(), + csc_row_ind, + csc_val); +#undef CSR2CSC_DIM + } + + return rocsparse_status_success; +} + +#endif // ROCSPARSE_CSR2CSC_HPP From eb140d8f6b4735f1bc7a519d65e56b88453dadce Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 12 Jul 2018 16:15:15 +0200 Subject: [PATCH 150/304] fixed typo --- library/src/conversion/rocsparse_csrsort.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index 4f34e76a..e53650da 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -221,7 +221,7 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, #elif defined(__HIP_PLATFORM_NVCC__) hipcub::DoubleBuffer dummy(csr_col_ind, csr_col_ind); - ETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortKeys( + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortKeys( nullptr, size, dummy, nnz, m, csr_row_ptr, csr_row_ptr + 1, startbit, endbit, stream)); #endif } From fa32f796ae2c06d43b3a9303629732ddce1a0695 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 12 Jul 2018 16:17:17 +0200 Subject: [PATCH 151/304] csr2csc tests and benchmark --- clients/benchmarks/client.cpp | 11 +- .../rocsparse_template_specialization.cpp | 36 ++ clients/include/rocsparse.hpp | 15 + clients/include/testing_csr2csc.hpp | 558 ++++++++++++++++++ clients/tests/CMakeLists.txt | 1 + clients/tests/test_csr2csc.cpp | 65 ++ 6 files changed, 685 insertions(+), 1 deletion(-) create mode 100644 clients/include/testing_csr2csc.hpp create mode 100644 clients/tests/test_csr2csc.cpp diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index cf8d4737..e6d157ed 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -21,6 +21,7 @@ // Conversion #include "testing_csr2coo.hpp" +#include "testing_csr2csc.hpp" #include "testing_csr2ell.hpp" #include "testing_csr2hyb.hpp" #include "testing_coo2csr.hpp" @@ -83,7 +84,8 @@ int main(int argc, char* argv[]) "SPARSE function to test. 
Options:\n" " Level1: axpyi, doti, gthr, gthrz, roti, sctr\n" " Level2: coomv, csrmv, ellmv, hybmv\n" - " Conversion: csr2coo, csr2ell, csr2hyb, coo2csr\n" + " Conversion: csr2coo, csr2csc, csr2ell,\n" + " csr2hyb, coo2csr\n" " Sorting: csrsort") ("precision,r", @@ -214,6 +216,13 @@ int main(int argc, char* argv[]) { testing_csr2coo(argus); } + else if(function == "csr2csc") + { + if(precision == 's') + testing_csr2csc(argus); + else if(precision == 'd') + testing_csr2csc(argus); + } else if(function == "csr2ell") { if(precision == 's') diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 3147d7b7..270b3f8d 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -286,6 +286,42 @@ rocsparse_status rocsparse_hybmv(rocsparse_handle handle, return rocsparse_dhybmv(handle, trans, alpha, descr, hyb, x, beta, y); } +template <> +rocsparse_status rocsparse_csr2csc(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + float* csc_val, + rocsparse_int* csc_row_ind, + rocsparse_int* csc_col_ptr, + rocsparse_action copy_values, + rocsparse_index_base idx_base, + void* temp_buffer) +{ + return rocsparse_scsr2csc(handle, m, n, nnz, csr_val, csr_row_ptr, csr_col_ind, csc_val, csc_row_ind, csc_col_ptr, copy_values, idx_base, temp_buffer); +} + +template <> +rocsparse_status rocsparse_csr2csc(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + double* csc_val, + rocsparse_int* csc_row_ind, + rocsparse_int* csc_col_ptr, + rocsparse_action copy_values, + rocsparse_index_base idx_base, + void* temp_buffer) +{ + return rocsparse_dcsr2csc(handle, m, n, nnz, csr_val, csr_row_ptr, 
csr_col_ind, csc_val, csc_row_ind, csc_col_ptr, copy_values, idx_base, temp_buffer); +} + template <> rocsparse_status rocsparse_csr2ell(rocsparse_handle handle, rocsparse_int m, diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index c24279c8..a74400b5 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -115,6 +115,21 @@ rocsparse_status rocsparse_hybmv(rocsparse_handle handle, const T* beta, T* y); +template +rocsparse_status rocsparse_csr2csc(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + T* csc_val, + rocsparse_int* csc_row_ind, + rocsparse_int* csc_col_ptr, + rocsparse_action copy_values, + rocsparse_index_base idx_base, + void* temp_buffer); + template rocsparse_status rocsparse_csr2ell(rocsparse_handle handle, rocsparse_int m, diff --git a/clients/include/testing_csr2csc.hpp b/clients/include/testing_csr2csc.hpp new file mode 100644 index 00000000..8d290840 --- /dev/null +++ b/clients/include/testing_csr2csc.hpp @@ -0,0 +1,558 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef TESTING_CSR2CSC_HPP +#define TESTING_CSR2CSC_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_csr2csc_bad_arg(void) +{ + rocsparse_int m = 100; + rocsparse_int n = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + rocsparse_status status; + + size_t size = 0; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + auto csr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto csc_row_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csc_col_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csc_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto buffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); + rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); + T* csr_val = (T*)csr_val_managed.get(); + rocsparse_int* csc_row_ind = (rocsparse_int*)csc_row_ind_managed.get(); + rocsparse_int* csc_col_ptr = (rocsparse_int*)csc_col_ptr_managed.get(); + T* csc_val = (T*)csc_val_managed.get(); + void* buffer = (void*)buffer_managed.get(); + + if(!csr_row_ptr || !csr_col_ind || !csr_val || !csc_row_ind || !csc_col_ptr || !csc_val || !buffer) + { + 
PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // Testing rocsparse_csr2csc_buffer_size() + + // Testing for (csr_row_ptr == nullptr) + { + rocsparse_int* csr_row_ptr_null = nullptr; + + status = rocsparse_csr2csc_buffer_size(handle, + m, + n, + nnz, + csr_row_ptr_null, + csr_col_ind, + rocsparse_action_numeric, + &size); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); + } + + // Testing for (csr_col_ind == nullptr) + { + rocsparse_int* csr_col_ind_null = nullptr; + + status = rocsparse_csr2csc_buffer_size(handle, + m, + n, + nnz, + csr_row_ptr, + csr_col_ind_null, + rocsparse_action_numeric, + &size); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr"); + } + + // Testing for (buffer_size == nullptr) + { + size_t* buffer_size_null = nullptr; + + status = rocsparse_csr2csc_buffer_size(handle, + m, + n, + nnz, + csr_row_ptr, + csr_col_ind, + rocsparse_action_numeric, + buffer_size_null); + verify_rocsparse_status_invalid_pointer(status, "Error: buffer_size is nullptr"); + } + + // Testing for (handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csr2csc_buffer_size(handle_null, + m, + n, + nnz, + csr_row_ptr, + csr_col_ind, + rocsparse_action_numeric, + &size); + verify_rocsparse_status_invalid_handle(status); + } + + // Testing rocsparse_csr2csc() + + // Testing for (csr_row_ptr == nullptr) + { + rocsparse_int* csr_row_ptr_null = nullptr; + + status = rocsparse_csr2csc(handle, + m, + n, + nnz, + csr_val, + csr_row_ptr_null, + csr_col_ind, + csc_val, + csc_row_ind, + csc_col_ptr, + rocsparse_action_numeric, + rocsparse_index_base_zero, + buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); + } + + // Testing for (csr_col_ind == nullptr) + { + rocsparse_int* csr_col_ind_null = nullptr; + + status = rocsparse_csr2csc(handle, + m, + n, + nnz, + csr_val, + csr_row_ptr, + csr_col_ind_null, + csc_val, + csc_row_ind, + 
csc_col_ptr, + rocsparse_action_numeric, + rocsparse_index_base_zero, + buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr"); + } + + // Testing for (csr_val == nullptr) + { + T* csr_val_null = nullptr; + + status = rocsparse_csr2csc(handle, + m, + n, + nnz, + csr_val_null, + csr_row_ptr, + csr_col_ind, + csc_val, + csc_row_ind, + csc_col_ptr, + rocsparse_action_numeric, + rocsparse_index_base_zero, + buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: csr_val is nullptr"); + } + + // Testing for (csc_row_ind == nullptr) + { + rocsparse_int* csc_row_ind_null = nullptr; + + status = rocsparse_csr2csc(handle, + m, + n, + nnz, + csr_val, + csr_row_ptr, + csr_col_ind, + csc_val, + csc_row_ind_null, + csc_col_ptr, + rocsparse_action_numeric, + rocsparse_index_base_zero, + buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: csc_row_ind is nullptr"); + } + + // Testing for (csc_col_ptr == nullptr) + { + rocsparse_int* csc_col_ptr_null = nullptr; + + status = rocsparse_csr2csc(handle, + m, + n, + nnz, + csr_val, + csr_row_ptr, + csr_col_ind, + csc_val, + csc_row_ind, + csc_col_ptr_null, + rocsparse_action_numeric, + rocsparse_index_base_zero, + buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: csc_col_ptr is nullptr"); + } + + // Testing for (csc_val == nullptr) + { + T* csc_val_null = nullptr; + + status = rocsparse_csr2csc(handle, + m, + n, + nnz, + csr_val, + csr_row_ptr, + csr_col_ind, + csc_val_null, + csc_row_ind, + csc_col_ptr, + rocsparse_action_numeric, + rocsparse_index_base_zero, + buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: csc_val is nullptr"); + } + + // Testing for (buffer == nullptr) + { + void* buffer_null = nullptr; + + status = rocsparse_csr2csc(handle, + m, + n, + nnz, + csr_val, + csr_row_ptr, + csr_col_ind, + csc_val, + csc_row_ind, + csc_col_ptr, + rocsparse_action_numeric, + rocsparse_index_base_zero, + buffer_null); + 
verify_rocsparse_status_invalid_pointer(status, "Error: buffer is nullptr"); + } + + // Testing for (handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csr2csc(handle_null, + m, + n, + nnz, + csr_val, + csr_row_ptr, + csr_col_ind, + csc_val, + csc_row_ind, + csc_col_ptr, + rocsparse_action_numeric, + rocsparse_index_base_zero, + buffer); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_csr2csc(Arguments argus) +{ + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + rocsparse_int safe_size = 100; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_action action = argus.action; + rocsparse_status status; + + size_t size = 0; + + double scale = 0.02; + if(m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + // Argument sanity check before allocating invalid memory + if(m <= 0 || n <= 0 || nnz <= 0) + { + auto csr_row_ptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_col_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto csc_row_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csc_col_ptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csc_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto buffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); + rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); + T* csr_val = 
(T*)csr_val_managed.get(); + rocsparse_int* csc_row_ind = (rocsparse_int*)csc_row_ind_managed.get(); + rocsparse_int* csc_col_ptr = (rocsparse_int*)csc_col_ptr_managed.get(); + T* csc_val = (T*)csc_val_managed.get(); + void* buffer = (void*)buffer_managed.get(); + + if(!csr_row_ptr || !csr_col_ind || !csr_val || !csc_row_ind || !csc_col_ptr || !csc_val || !buffer) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!csr_row_ptr || !csr_col_ind || !csr_val || " + "!csc_row_ind || !csc_col_ptr || !csc_val || !buffer"); + return rocsparse_status_memory_error; + } + + status = rocsparse_csr2csc_buffer_size( + handle, m, n, nnz, csr_row_ptr, csr_col_ind, action, &size); + + if(m < 0 || n < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + + // Buffer size should be zero + size_t zero = 0; + unit_check_general(1, 1, &zero, &size); + } + + status = rocsparse_csr2csc(handle, + m, + n, + nnz, + csr_val, + csr_row_ptr, + csr_col_ind, + csc_val, + csc_row_ind, + csc_col_ptr, + action, + idx_base, + buffer); + + if(m < 0 || n < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + } + + return rocsparse_status_success; + } + + // Host structures + std::vector hcsr_row_ptr; + std::vector hcsr_col_ind; + std::vector hcsr_val; + + // Sample initial COO matrix on CPU + srand(12345ULL); + if(argus.laplacian) + { + m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base); + nnz = hcsr_row_ptr[m]; + } + else + { + std::vector hcoo_row_ind; + + if(argus.filename != "") + { + if(read_mtx_matrix( + argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + 
return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base); + } + + // Convert COO to CSR + hcsr_row_ptr.resize(m + 1, 0); + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; + } + + hcsr_row_ptr[0] = idx_base; + for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + } + } + + // Allocate memory on the device + auto dcsr_row_ptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + auto dcsr_col_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dcsr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dcsc_row_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dcsc_col_ptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (n + 1)), device_free}; + auto dcsc_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + + rocsparse_int* dcsr_row_ptr = (rocsparse_int*)dcsr_row_ptr_managed.get(); + rocsparse_int* dcsr_col_ind = (rocsparse_int*)dcsr_col_ind_managed.get(); + T* dcsr_val = (T*)dcsr_val_managed.get(); + rocsparse_int* dcsc_row_ind = (rocsparse_int*)dcsc_row_ind_managed.get(); + rocsparse_int* dcsc_col_ptr = (rocsparse_int*)dcsc_col_ptr_managed.get(); + T* dcsc_val = (T*)dcsc_val_managed.get(); + + if(!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val || !dcsc_row_ind || !dcsc_col_ptr || !dcsc_val) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val || " + "!dcsc_row_ind || !dcsc_col_ptr || !dcsc_val"); + return rocsparse_status_memory_error; + } + + // Reset CSC arrays + CHECK_HIP_ERROR(hipMemset(dcsc_row_ind, 0, sizeof(rocsparse_int) * nnz)); + CHECK_HIP_ERROR(hipMemset(dcsc_col_ptr, 0, sizeof(rocsparse_int) * (n + 1))); + 
CHECK_HIP_ERROR(hipMemset(dcsc_val, 0, sizeof(T) * nnz)); + + // Copy data from host to device + CHECK_HIP_ERROR(hipMemcpy(dcsr_row_ptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcsr_col_ind, hcsr_col_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcsr_val, hcsr_val.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + + // Obtain buffer size + CHECK_ROCSPARSE_ERROR(rocsparse_csr2csc_buffer_size( + handle, m, n, nnz, dcsr_row_ptr, dcsr_col_ind, action, &size)); + + // Allocate buffer on the device + auto dbuffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * size), device_free}; + + void* dbuffer = (void*)dbuffer_managed.get(); + + if(!dbuffer) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dbuffer"); + return rocsparse_status_memory_error; + } + + if(argus.unit_check) + { + CHECK_ROCSPARSE_ERROR(rocsparse_csr2csc(handle, m, n, nnz, dcsr_val, dcsr_row_ptr, dcsr_col_ind, dcsc_val, dcsc_row_ind, dcsc_col_ptr, action, idx_base, dbuffer)); + + // Copy output from device to host + std::vector hcsc_row_ind(nnz); + std::vector hcsc_col_ptr(n + 1); + std::vector hcsc_val(nnz); + + CHECK_HIP_ERROR(hipMemcpy(hcsc_row_ind.data(), dcsc_row_ind, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hcsc_col_ptr.data(), dcsc_col_ptr, sizeof(rocsparse_int) * (n + 1), hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hcsc_val.data(), dcsc_val, sizeof(T) * nnz, hipMemcpyDeviceToHost)); + + // Host csr2csc conversion + std::vector hcsc_row_ind_gold(nnz); + std::vector hcsc_col_ptr_gold(n + 1, 0); + std::vector hcsc_val_gold(nnz); + + // Determine nnz per column + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsc_col_ptr_gold[hcsr_col_ind[i] + 1 - idx_base]; + } + + // Scan + for(rocsparse_int i = 0; i < n; ++i) + { + hcsc_col_ptr_gold[i + 1] += hcsc_col_ptr_gold[i]; + } + + // Fill row indices and values 
+ for(rocsparse_int i = 0; i < m; ++i) + { + for(rocsparse_int j = hcsr_row_ptr[i]; j < hcsr_row_ptr[i + 1]; ++j) + { + rocsparse_int col = hcsr_col_ind[j - idx_base] - idx_base; + rocsparse_int idx = hcsc_col_ptr_gold[col]; + + hcsc_row_ind_gold[idx] = i + idx_base; + hcsc_val_gold[idx] = hcsr_val[j - idx_base]; + + ++hcsc_col_ptr_gold[col]; + } + } + + // Shift column pointer array + for(rocsparse_int i = n; i > 0; --i) + { + hcsc_col_ptr_gold[i] = hcsc_col_ptr_gold[i - 1] + idx_base; + } + + hcsc_col_ptr_gold[0] = idx_base; + + // Unit check + unit_check_general(1, nnz, hcsc_row_ind_gold.data(), hcsc_row_ind.data()); + unit_check_general(1, n + 1, hcsc_col_ptr_gold.data(), hcsc_col_ptr.data()); + + // If action == rocsparse_action_numeric also check values + if(action == rocsparse_action_numeric) + { + unit_check_general(1, nnz, hcsc_val_gold.data(), hcsc_val.data()); + } + } + + if(argus.timing) + { + rocsparse_int number_cold_calls = 2; + rocsparse_int number_hot_calls = argus.iters; + + for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) + { + rocsparse_csr2csc(handle, m, n, nnz, dcsr_val, dcsr_row_ptr, dcsr_col_ind, dcsc_val, dcsc_row_ind, dcsc_col_ptr, rocsparse_action_numeric, rocsparse_index_base_zero, dbuffer); + } + + double gpu_time_used = get_time_us(); + + for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) + { + rocsparse_csr2csc(handle, m, n, nnz, dcsr_val, dcsr_row_ptr, dcsr_col_ind, dcsc_val, dcsc_row_ind, dcsc_col_ptr, rocsparse_action_numeric, rocsparse_index_base_zero, dbuffer); + } + + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + + printf("m\t\tn\t\tnnz\t\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\n", m, n, nnz, gpu_time_used); + } + + return rocsparse_status_success; +} + +#endif // TESTING_CSR2CSC_HPP diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index 0304b631..7d45261b 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -17,6 +17,7 
@@ set(ROCSPARSE_TEST_SOURCES test_ellmv.cpp test_hybmv.cpp test_csr2coo.cpp + test_csr2csc.cpp test_csr2ell.cpp test_csr2hyb.cpp test_coo2csr.cpp diff --git a/clients/tests/test_csr2csc.cpp b/clients/tests/test_csr2csc.cpp new file mode 100644 index 00000000..009b1b31 --- /dev/null +++ b/clients/tests/test_csr2csc.cpp @@ -0,0 +1,65 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "testing_csr2csc.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef std::tuple csr2csc_tuple; + +int csr2csc_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int csr2csc_N_range[] = {-3, 0, 33, 242, 623, 1000}; + +rocsparse_action csr2csc_action_range[] = {rocsparse_action_numeric, rocsparse_action_symbolic}; + +rocsparse_index_base csr2csc_csr_base_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; + +class parameterized_csr2csc : public testing::TestWithParam +{ + protected: + parameterized_csr2csc() {} + virtual ~parameterized_csr2csc() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_csr2csc_arguments(csr2csc_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.action = std::get<2>(tup); + arg.idx_base = std::get<3>(tup); + arg.timing = 0; + return arg; +} + +TEST(csr2csc_bad_arg, csr2csc) { testing_csr2csc_bad_arg(); } + +TEST_P(parameterized_csr2csc, csr2csc_float) +{ + Arguments arg = setup_csr2csc_arguments(GetParam()); + + rocsparse_status status = testing_csr2csc(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csr2csc, csr2csc_double) +{ + Arguments arg = setup_csr2csc_arguments(GetParam()); + + rocsparse_status status = testing_csr2csc(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(csr2csc, + parameterized_csr2csc, + 
testing::Combine(testing::ValuesIn(csr2csc_M_range), + testing::ValuesIn(csr2csc_N_range), + testing::ValuesIn(csr2csc_action_range), + testing::ValuesIn(csr2csc_csr_base_range))); From 0fb238994a0f9d9f4014e87c5d258a333e23a3b3 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 12 Jul 2018 16:17:39 +0200 Subject: [PATCH 152/304] printf output for reading matrix market files during tests/benchmarks --- clients/include/utility.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index a0800540..cb90375c 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -333,6 +333,9 @@ rocsparse_int read_mtx_matrix(const char* filename, std::vector& col, std::vector& val) { + printf("Reading matrix %s...", filename); + fflush(stdout); + FILE* f = fopen(filename, "r"); if(!f) { @@ -483,6 +486,9 @@ rocsparse_int read_mtx_matrix(const char* filename, val[i] = unsorted_val[perm[i]]; } + printf("done.\n"); + fflush(stdout); + return 0; } @@ -603,6 +609,7 @@ class Arguments rocsparse_operation trans = rocsparse_operation_none; rocsparse_index_base idx_base = rocsparse_index_base_zero; rocsparse_index_base idx_base2 = rocsparse_index_base_zero; + rocsparse_action action = rocsparse_action_numeric; rocsparse_hyb_partition part = rocsparse_hyb_partition_auto; rocsparse_int norm_check = 0; @@ -628,6 +635,7 @@ class Arguments trans = rhs.trans; idx_base = rhs.idx_base; idx_base2 = rhs.idx_base2; + action = rhs.action; part = rhs.part; norm_check = rhs.norm_check; From 1260ff06f64635d6ac9df6bfd8f7af40d858f8a3 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 12 Jul 2018 16:18:17 +0200 Subject: [PATCH 153/304] fix for csrsort test --- clients/include/testing_csrsort.hpp | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/clients/include/testing_csrsort.hpp b/clients/include/testing_csrsort.hpp index 66218900..6596f64c 100644 --- 
a/clients/include/testing_csrsort.hpp +++ b/clients/include/testing_csrsort.hpp @@ -173,14 +173,19 @@ rocsparse_status testing_csrsort(Arguments argus) rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto csr_col_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto perm_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto buffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); + rocsparse_int* perm = (rocsparse_int*)perm_managed.get(); + void* buffer = (void*)buffer_managed.get(); - if(!csr_row_ptr || !csr_col_ind) + if(!csr_row_ptr || !csr_col_ind || !perm || !buffer) { verify_rocsparse_status_success(rocsparse_status_memory_error, - "!csr_row_ptr || !csr_col_ind"); + "!csr_row_ptr || !csr_col_ind || !perm || !buffer"); return rocsparse_status_memory_error; } @@ -200,6 +205,25 @@ rocsparse_status testing_csrsort(Arguments argus) unit_check_general(1, 1, &zero, &buffer_size); } + status = rocsparse_csrsort(handle, + m, + n, + nnz, + descr, + csr_row_ptr, + csr_col_ind, + perm, + buffer); + + if(m < 0 || n < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + } + return rocsparse_status_success; } From a3e1e3a509bb74d23b24b8b1fca832e1d329e5b3 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 12 Jul 2018 16:33:03 +0200 Subject: [PATCH 154/304] added hipcub to required cmake dependencies --- cmake/Dependencies.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index eadc9528..b064632e 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake 
@@ -122,6 +122,7 @@ if(HIP_PLATFORM STREQUAL "hcc") UPDATE_DISCONNECT TRUE ) find_package(ROCPRIM REQUIRED CONFIG PATHS ${ROCPRIM_ROOT}) + find_package(HIPCUB REQUIRED CONFIG PATHS ${ROCPRIM_ROOT}) endif() elseif(HIP_PLATFORM STREQUAL "nvcc") find_package(HIPCUB QUIET CONFIG PATHS /opt/rocm) From 8cb200353244184140f460bb462b5e8a055b319b Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 12 Jul 2018 17:02:20 +0200 Subject: [PATCH 155/304] bugfix in coo2csr: binsearch missed out the very last element --- library/src/conversion/coo2csr_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/src/conversion/coo2csr_device.h b/library/src/conversion/coo2csr_device.h index c27fa48e..8f2afddf 100644 --- a/library/src/conversion/coo2csr_device.h +++ b/library/src/conversion/coo2csr_device.h @@ -53,7 +53,7 @@ __global__ void coo2csr_kernel(rocsparse_int m, } // Binary search - csr_row_ptr[gid] = lower_bound(coo_row_ind, gid + idx_base, 0, nnz - 1) + idx_base; + csr_row_ptr[gid] = lower_bound(coo_row_ind, gid + idx_base, 0, nnz) + idx_base; } #endif // COO2CSR_DEVICE_H From fb57d091184f31035c0c7707d4197c337f3f75c8 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 13 Jul 2018 07:33:05 +0200 Subject: [PATCH 156/304] coosort preparation --- clients/tests/CMakeLists.txt | 1 + library/include/rocsparse-functions.h | 117 ++++++++++++++++++++++++++ library/src/CMakeLists.txt | 1 + 3 files changed, 119 insertions(+) diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index 0304b631..243e47e1 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -22,6 +22,7 @@ set(ROCSPARSE_TEST_SOURCES test_coo2csr.cpp test_identity.cpp test_csrsort.cpp + test_coosort.cpp ) set(ROCSPARSE_CLIENTS_COMMON diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index f5b8cb8b..2dae7b42 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -1170,6 
+1170,123 @@ rocsparse_status rocsparse_csrsort(rocsparse_handle handle, rocsparse_int* perm, void* temp_buffer); +/*! \brief SPARSE Format Conversions API + + \details + coosort_buffer_size returns the size of the temporary storage buffer + that is required by coosort. The temporary storage buffer has to be + allocated by the user. + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero elements of A. + @param[in] + coo_row_ind array of nnz elements containing the row indices of + A. + @param[in] + coo_col_ind array of nnz elements containing the column indices + of A. + @param[out] + buffer_size number of bytes of the temporary storage buffer. + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_coosort_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + size_t* buffer_size); + +/*! \brief SPARSE Format Conversions API + + \details + coosort_by_row sorts a matrix in COO format by row. coosort requires + a temporary storage buffer. The sorted permutation vector perm can + be used to obtain sorted coo_val array. In this case, P must be + initialized as the identity permutation 0:1:(nnz-1). + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero elements of A. + @param[in] + coo_row_ind array of nnz elements containing the row indices of + A. + @param[inout] + coo_col_ind array of nnz elements containing the column indices + of A. + @param[inout] + perm array of nnz integers containing the unsorted map + indices. 
+ @param[in] + temp_buffer temporary storage buffer allocated by the user, + size is returned by coosort_buffer_size + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + rocsparse_int* coo_row_ind, + rocsparse_int* coo_col_ind, + rocsparse_int* perm, + void* temp_buffer); + +/*! \brief SPARSE Format Conversions API + + \details + coosort_by_column sorts a matrix in COO format by column. coosort + requires a temporary storage buffer. The sorted permutation vector + perm can be used to obtain sorted coo_val array. In this case, P + must be initialized as the identity permutation 0:1:(nnz-1). + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero elements of A. + @param[in] + coo_row_ind array of nnz elements containing the row indices of + A. + @param[inout] + coo_col_ind array of nnz elements containing the column indices + of A. + @param[inout] + perm array of nnz integers containing the unsorted map + indices. 
+ @param[in] + temp_buffer temporary storage buffer allocated by the user, + size is returned by coosort_buffer_size + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_coosort_by_column(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + rocsparse_int* coo_row_ind, + rocsparse_int* coo_col_ind, + rocsparse_int* perm, + void* temp_buffer); + #ifdef __cplusplus } #endif diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 508b0ae9..b0671dcf 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -30,4 +30,5 @@ set(rocsparse_source src/conversion/rocsparse_coo2csr.cpp src/conversion/rocsparse_identity.cpp src/conversion/rocsparse_csrsort.cpp + src/conversion/rocsparse_coosort.cpp ) From e110fe589b82cad2a316221e91a6440f8bc695ac Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 13 Jul 2018 07:37:39 +0200 Subject: [PATCH 157/304] bugfix in cmake where hipcub was not found --- cmake/Dependencies.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index b064632e..1c3f8563 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -107,6 +107,7 @@ endif() # rocPRIM package if(HIP_PLATFORM STREQUAL "hcc") find_package(ROCPRIM QUIET CONFIG PATHS /opt/rocm) + find_package(HIPCUB QUIET CONFIG PATHS /opt/rocm) if(NOT ROCPRIM_FOUND) set(ROCPRIM_ROOT ${CMAKE_CURRENT_BINARY_DIR}/rocPRIM CACHE PATH "") message(STATUS "Downloading rocPRIM.") From 82382682f3361e9456be53898ba7f9d1d28a2c70 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 13 Jul 2018 16:09:46 +0200 Subject: [PATCH 158/304] coosort_by_row and coosort_by_column routines added --- library/src/conversion/coosort_device.h | 27 ++ library/src/conversion/rocsparse_coosort.cpp | 406 +++++++++++++++++++ 2 files changed, 433 insertions(+) create mode 100644 library/src/conversion/coosort_device.h create mode 
100644 library/src/conversion/rocsparse_coosort.cpp diff --git a/library/src/conversion/coosort_device.h b/library/src/conversion/coosort_device.h new file mode 100644 index 00000000..ab8700f2 --- /dev/null +++ b/library/src/conversion/coosort_device.h @@ -0,0 +1,27 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef COOSORT_DEVICE_H +#define COOSORT_DEVICE_H + +#include + +// COO to CSR matrix conversion kernel +__global__ void coosort_permute_kernel(rocsparse_int nnz, + const rocsparse_int* in, + const rocsparse_int* perm, + rocsparse_int* out) +{ + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + if(gid >= nnz) + { + return; + } + + out[gid] = in[perm[gid]]; +} + +#endif // COOSORT_DEVICE_H diff --git a/library/src/conversion/rocsparse_coosort.cpp b/library/src/conversion/rocsparse_coosort.cpp new file mode 100644 index 00000000..f6c97e5a --- /dev/null +++ b/library/src/conversion/rocsparse_coosort.cpp @@ -0,0 +1,406 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "rocsparse.h" +#include "definitions.h" +#include "handle.h" +#include "utility.h" +#include "coosort_device.h" + +#include +#include + +#if defined(__HIP_PLATFORM_HCC__) +#include +#endif + +extern "C" rocsparse_status rocsparse_coosort_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + size_t* buffer_size) +{ + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging TODO bench logging + log_trace(handle, + "rocsparse_coosort_buffer_size", + m, + n, + nnz, + (const void*&)coo_row_ind, + (const void*&)coo_col_ind, + (const void*&)buffer_size); + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(n < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(coo_row_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(coo_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(buffer_size == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0 || nnz == 0) + { + *buffer_size = 0; + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + rocsparse_int* ptr = reinterpret_cast(buffer_size); + + // Determine max buffer size + size_t size; + *buffer_size = 0; + hipcub::DoubleBuffer dummy(ptr, ptr); + + RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode(nullptr, size, ptr, ptr, ptr, ptr, nnz, stream)); + *buffer_size = std::max(size, *buffer_size); + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::ExclusiveSum(nullptr, size, ptr, ptr, m + 1, stream)); + *buffer_size = std::max(size, *buffer_size); + 
RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(nullptr, size, dummy, dummy, nnz, 0, 32, stream)); + *buffer_size = std::max(size, *buffer_size); +#if defined(__HIP_PLATFORM_HCC__) + rocprim::double_buffer rpdummy(ptr, ptr); + + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(nullptr, size, rpdummy, rpdummy, nnz, m, ptr, ptr + 1, 0, 32, stream)); + *buffer_size = std::max(size, *buffer_size); +#elif defined(__HIP_PLATFORM_NVCC__) + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, size, dummy, dummy, nnz, m, ptr, ptr, 0, 32, stream)); + *buffer_size = std::max(size, *buffer_size); +#endif + *buffer_size = ((*buffer_size - 1) / 256 + 1) * 256; + + // rocPRIM does not support in-place sorting, so we need additional buffer + // for all temporary arrays + + // rows buffer + *buffer_size += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; + // columns buffer + *buffer_size += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; + // perm buffer + *buffer_size += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; + // segment buffer + *buffer_size += sizeof(rocsparse_int) * (std::max(m, n) / 256 + 1) * 256; + + return rocsparse_status_success; +} + +extern "C" rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + rocsparse_int* coo_row_ind, + rocsparse_int* coo_col_ind, + rocsparse_int* perm, + void* temp_buffer) +{ + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + + // Logging TODO bench logging + log_trace(handle, + "rocsparse_coosort_by_row", + m, + n, + nnz, + (const void*&)coo_row_ind, + (const void*&)coo_col_ind, + (const void*&)perm, + (const void*&)temp_buffer); + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(n < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer 
arguments + if(coo_row_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(coo_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(temp_buffer == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + unsigned int startbit = 0; + unsigned int endbit = rocsparse_clz(m); + + // Temporary buffer entry points + char* ptr = reinterpret_cast(temp_buffer); + + // Permutation vector given + rocsparse_int* work1 = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; + + rocsparse_int* work2 = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; + + rocsparse_int* work3 = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; + + rocsparse_int* work4 = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * (std::max(m, n) / 256 + 1) * 256; + + // Temporary rocprim buffer + size_t size = 0; + void* tmp_rocprim = reinterpret_cast(ptr); + + if(perm != nullptr) + { + // Create identitiy permutation to keep track of reorderings + RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_identity_permutation(handle, nnz, work1)); + + // Sort by rows and store permutation + hipcub::DoubleBuffer keys(coo_row_ind, work3); + hipcub::DoubleBuffer vals(work1, work2); + + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(nullptr, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(tmp_rocprim, size, keys, vals, nnz, startbit, endbit, stream)); + + rocsparse_int* output = keys.Current(); + rocsparse_int* mapping = vals.Current(); + rocsparse_int* alt_map = vals.Alternate(); + + // Copy sorted rows, if stored in buffer + if(output != coo_row_ind) + { + RETURN_IF_HIP_ERROR(hipMemcpy(coo_row_ind, output, sizeof(rocsparse_int) * 
nnz, hipMemcpyDeviceToDevice)); + } + + // Obtain segments for segmented sort by columns + RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode(nullptr, size, coo_row_ind, work3 + 1, work4, work3, nnz, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode(tmp_rocprim, size, coo_row_ind, work3 + 1, work4, work3, nnz, stream)); + + rocsparse_int nsegm; + RETURN_IF_HIP_ERROR(hipMemcpy(&nsegm, work3, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::ExclusiveSum(nullptr, size, work4, work4, nsegm + 1, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::ExclusiveSum(tmp_rocprim, size, work4, work4, nsegm + 1, stream)); + + // Reorder columns +#define COOSORT_DIM 512 + dim3 coosort_blocks((nnz - 1) / COOSORT_DIM + 1); + dim3 coosort_threads(COOSORT_DIM); +#undef COOSORT_DIM + + hipLaunchKernelGGL((coosort_permute_kernel), + coosort_blocks, + coosort_threads, + 0, + stream, + nnz, + coo_col_ind, + mapping, + work3); + + hipLaunchKernelGGL((coosort_permute_kernel), + coosort_blocks, + coosort_threads, + 0, + stream, + nnz, + perm, + mapping, + alt_map); + + // Sort columns per row + endbit = rocsparse_clz(n); + +#if defined(__HIP_PLATFORM_HCC__) + rocprim::double_buffer keys2(work3, coo_col_ind); + rocprim::double_buffer vals2(alt_map, perm); + + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(nullptr, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + + rocsparse_int avg_row_nnz = nnz / nsegm; + + if(avg_row_nnz < 64) + { + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + } + else if(avg_row_nnz < 128) + { + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, size, keys2, vals2, 
nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + } + else if(avg_row_nnz < 256) + { + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + } + else + { + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + } + + output = keys2.current(); + mapping = vals2.current(); +#elif defined(__HIP_PLATFORM_NVCC__) + hipcub::DoubleBuffer keys2(work3, coo_col_ind); + hipcub::DoubleBuffer vals2(alt_map, perm); + + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(tmp_rocprim, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + + output = keys2.Current(); + mapping = vals2.Current(); +#endif + // Copy sorted columns, if stored in buffer + if(output != coo_col_ind) + { + RETURN_IF_HIP_ERROR(hipMemcpy(coo_col_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } + + // Copy reordered permutation, if stored in buffer + if(mapping != perm) + { + RETURN_IF_HIP_ERROR(hipMemcpy(perm, mapping, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } + } + else + { + // No permutation vector given + + // Sort by rows and permute columns +#if defined(__HIP_PLATFORM_HCC__) + rocprim::double_buffer keys(coo_row_ind, work3); + rocprim::double_buffer vals(coo_col_ind, work2); + + RETURN_IF_HIP_ERROR(rocprim::radix_sort_pairs(nullptr, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(rocprim::radix_sort_pairs(tmp_rocprim, size, keys, vals, nnz, startbit, endbit, stream)); + rocsparse_int* output = keys.current(); +#elif defined(__HIP_PLATFORM_NVCC__) + 
hipcub::DoubleBuffer keys(coo_row_ind, work3); + hipcub::DoubleBuffer vals(coo_col_ind, work2); + + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(nullptr, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(tmp_rocprim, size, keys, vals, nnz, startbit, endbit, stream)); + rocsparse_int* output = keys.Current(); +#endif + + // Copy sorted rows, if stored in buffer + if(output != coo_row_ind) + { + RETURN_IF_HIP_ERROR(hipMemcpy(coo_row_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } + + // Obtain segments for segmented sort by columns + RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode(nullptr, size, coo_row_ind, work3 + 1, work4, work3, nnz, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode(tmp_rocprim, size, coo_row_ind, work3 + 1, work4, work3, nnz, stream)); + + rocsparse_int nsegm; + RETURN_IF_HIP_ERROR(hipMemcpy(&nsegm, work3, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::ExclusiveSum(nullptr, size, work4, work4, nsegm + 1, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::ExclusiveSum(tmp_rocprim, size, work4, work4, nsegm + 1, stream)); + + // Sort columns per row + endbit = rocsparse_clz(n); + +#if defined(__HIP_PLATFORM_HCC__) + rocsparse_int avg_row_nnz = nnz / nsegm; + + if(avg_row_nnz < 64) + { + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys(tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + } + else if(avg_row_nnz < 128) + { + using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys(tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + } + else if(avg_row_nnz < 256) + { + using config = rocprim::segmented_radix_sort_config<6, 5, 
rocprim::kernel_config<64, 4>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys(tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + } + else + { + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys(tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + } + output = vals.current(); +#elif defined(__HIP_PLATFORM_NVCC__) + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortKeys(tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + output = vals.Current(); +#endif + + // Copy sorted columns, if stored in buffer + if(output != coo_col_ind) + { + RETURN_IF_HIP_ERROR(hipMemcpy(coo_col_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + } + } + + return rocsparse_status_success; +} + +extern "C" rocsparse_status rocsparse_coosort_by_column(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + rocsparse_int* coo_row_ind, + rocsparse_int* coo_col_ind, + rocsparse_int* perm, + void* temp_buffer) +{ + return rocsparse_coosort_by_row(handle, n, m, nnz, coo_col_ind, coo_row_ind, perm, temp_buffer); +} From 0137478823e16f8b00d2eb3bed7f6b33b45926a7 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 13 Jul 2018 16:18:50 +0200 Subject: [PATCH 159/304] coosort tests and benchmark --- clients/benchmarks/client.cpp | 7 +- clients/include/testing_coosort.hpp | 508 ++++++++++++++++++++++++++++ clients/tests/test_coosort.cpp | 57 ++++ 3 files changed, 571 insertions(+), 1 deletion(-) create mode 100644 clients/include/testing_coosort.hpp create mode 100644 clients/tests/test_coosort.cpp diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index e6d157ed..df1319e0 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -27,6 +27,7 @@ #include "testing_coo2csr.hpp" #include "testing_identity.hpp" #include "testing_csrsort.hpp" +#include "testing_coosort.hpp" #include #include @@ 
-86,7 +87,7 @@ int main(int argc, char* argv[]) " Level2: coomv, csrmv, ellmv, hybmv\n" " Conversion: csr2coo, csr2csc, csr2ell,\n" " csr2hyb, coo2csr\n" - " Sorting: csrsort") + " Sorting: csrsort, coosort") ("precision,r", po::value(&precision)->default_value('s'), "Options: s,d") @@ -245,6 +246,10 @@ int main(int argc, char* argv[]) { testing_csrsort(argus); } + else if(function == "coosort") + { + testing_coosort(argus); + } else { fprintf(stderr, "Invalid value for --function\n"); diff --git a/clients/include/testing_coosort.hpp b/clients/include/testing_coosort.hpp new file mode 100644 index 00000000..67930d18 --- /dev/null +++ b/clients/include/testing_coosort.hpp @@ -0,0 +1,508 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef TESTING_COOSORT_HPP +#define TESTING_COOSORT_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +void testing_coosort_bad_arg(void) +{ + rocsparse_int m = 100; + rocsparse_int n = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + size_t buffer_size = 0; + + auto coo_row_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto coo_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto perm_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto buffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + + rocsparse_int* coo_row_ind = 
(rocsparse_int*)coo_row_ind_managed.get(); + rocsparse_int* coo_col_ind = (rocsparse_int*)coo_col_ind_managed.get(); + rocsparse_int* perm = (rocsparse_int*)perm_managed.get(); + void* buffer = (void*)buffer_managed.get(); + + if(!coo_row_ind || !coo_col_ind || !perm || !buffer) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // Testing coosort_buffer_size for bad args + + // Testing for (coo_row_ind == nullptr) + { + rocsparse_int* coo_row_ind_null = nullptr; + + status = rocsparse_coosort_buffer_size( + handle, m, n, nnz, coo_row_ind_null, coo_col_ind, &buffer_size); + verify_rocsparse_status_invalid_pointer(status, "Error: coo_row_ind is nullptr"); + } + + // Testing for (coo_col_ind == nullptr) + { + rocsparse_int* coo_col_ind_null = nullptr; + + status = rocsparse_coosort_buffer_size( + handle, m, n, nnz, coo_row_ind, coo_col_ind_null, &buffer_size); + verify_rocsparse_status_invalid_pointer(status, "Error: coo_col_ind is nullptr"); + } + + // Testing for (buffer_size == nullptr) + { + size_t* buffer_size_null = nullptr; + + status = rocsparse_coosort_buffer_size( + handle, m, n, nnz, coo_row_ind, coo_col_ind, buffer_size_null); + verify_rocsparse_status_invalid_pointer(status, "Error: buffer_size is nullptr"); + } + + // Testing for (handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_coosort_buffer_size( + handle_null, m, n, nnz, coo_row_ind, coo_col_ind, &buffer_size); + verify_rocsparse_status_invalid_handle(status); + } + + // Testing coosort_by_row for bad args + + // Testing for (coo_row_ind == nullptr) + { + rocsparse_int* coo_row_ind_null = nullptr; + + status = rocsparse_coosort_by_row( + handle, m, n, nnz, coo_row_ind_null, coo_col_ind, perm, buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: coo_row_ind is nullptr"); + } + + // Testing for (coo_col_ind == nullptr) + { + rocsparse_int* coo_col_ind_null = nullptr; + + status = rocsparse_coosort_by_row( + handle, m, n, nnz, 
coo_row_ind, coo_col_ind_null, perm, buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: coo_col_ind is nullptr"); + } + + // Testing for (buffer == nullptr) + { + rocsparse_int* buffer_null = nullptr; + + status = rocsparse_coosort_by_row( + handle, m, n, nnz, coo_row_ind, coo_col_ind, perm, buffer_null); + verify_rocsparse_status_invalid_pointer(status, "Error: buffer is nullptr"); + } + + // Testing for (handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_coosort_by_row( + handle_null, m, n, nnz, coo_row_ind, coo_col_ind, perm, buffer); + verify_rocsparse_status_invalid_handle(status); + } + + // Testing coosort_by_column for bad args + + // Testing for (coo_row_ind == nullptr) + { + rocsparse_int* coo_row_ind_null = nullptr; + + status = rocsparse_coosort_by_column( + handle, m, n, nnz, coo_row_ind_null, coo_col_ind, perm, buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: coo_row_ind is nullptr"); + } + + // Testing for (coo_col_ind == nullptr) + { + rocsparse_int* coo_col_ind_null = nullptr; + + status = rocsparse_coosort_by_column( + handle, m, n, nnz, coo_row_ind, coo_col_ind_null, perm, buffer); + verify_rocsparse_status_invalid_pointer(status, "Error: coo_col_ind is nullptr"); + } + + // Testing for (buffer == nullptr) + { + rocsparse_int* buffer_null = nullptr; + + status = rocsparse_coosort_by_column( + handle, m, n, nnz, coo_row_ind, coo_col_ind, perm, buffer_null); + verify_rocsparse_status_invalid_pointer(status, "Error: buffer is nullptr"); + } + + // Testing for (handle == nullptr) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_coosort_by_column( + handle_null, m, n, nnz, coo_row_ind, coo_col_ind, perm, buffer); + verify_rocsparse_status_invalid_handle(status); + } +} + +rocsparse_status testing_coosort(Arguments argus) +{ + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + rocsparse_int safe_size = 100; + rocsparse_int by_row = argus.trans == 
rocsparse_operation_none; + rocsparse_int permute = argus.temp; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_status status; + + size_t buffer_size = 0; + + double scale = 0.02; + if(m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + // Argument sanity check before allocating invalid memory + if(m <= 0 || n <= 0 || nnz <= 0) + { + auto coo_row_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto coo_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto perm_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto buffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + + rocsparse_int* coo_row_ind = (rocsparse_int*)coo_row_ind_managed.get(); + rocsparse_int* coo_col_ind = (rocsparse_int*)coo_col_ind_managed.get(); + rocsparse_int* perm = (rocsparse_int*)perm_managed.get(); + void* buffer = (void*)buffer_managed.get(); + + if(!coo_row_ind || !coo_col_ind || !perm || !buffer) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!coo_row_ind || !coo_col_ind || !perm || !buffer"); + return rocsparse_status_memory_error; + } + + status = rocsparse_coosort_buffer_size( + handle, m, n, nnz, coo_row_ind, coo_col_ind, &buffer_size); + + if(m < 0 || n < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + + // Buffer size should be zero + size_t zero = 0; + unit_check_general(1, 1, &zero, &buffer_size); + } + + if(by_row) + { + status = rocsparse_coosort_by_row(handle, + m, + n, + nnz, + coo_row_ind, + coo_col_ind, + perm, + buffer); + } + else + 
{ + status = rocsparse_coosort_by_column(handle, + m, + n, + nnz, + coo_row_ind, + coo_col_ind, + perm, + buffer); + } + + if(m < 0 || n < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + } + + return rocsparse_status_success; + } + + // For testing, assemble a COO matrix and convert it to CSR first (on host) + + // Host structures + std::vector hcoo_row_ind; + std::vector hcoo_col_ind; + std::vector hcoo_val; + + // Sample initial COO matrix on CPU + srand(12345ULL); + if(argus.laplacian) + { + std::vector hcsr_row_ptr; + m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcoo_col_ind, hcoo_val, idx_base); + nnz = hcsr_row_ptr[m]; + + // Convert CSR to COO + hcoo_row_ind.resize(nnz); + for(rocsparse_int i = 0; i < m; ++i) + { + for(rocsparse_int j = hcsr_row_ptr[i]; j < hcsr_row_ptr[i + 1]; ++j) + { + hcoo_row_ind[j] = i + idx_base; + } + } + } + else + { + if(argus.filename != "") + { + if(read_mtx_matrix( + argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val, idx_base); + } + } + + // Unsort COO columns + std::vector hcoo_row_ind_unsorted(nnz); + std::vector hcoo_col_ind_unsorted(nnz); + std::vector hcoo_val_unsorted(nnz); + + hcoo_row_ind_unsorted = hcoo_row_ind; + hcoo_col_ind_unsorted = hcoo_col_ind; + hcoo_val_unsorted = hcoo_val; + + for(rocsparse_int i = 0; i < nnz; ++i) + { + rocsparse_int rng = rand() % nnz; + + rocsparse_int temp_row = hcoo_row_ind_unsorted[i]; + rocsparse_int temp_col = hcoo_col_ind_unsorted[i]; + float temp_val = hcoo_val_unsorted[i]; + + hcoo_row_ind_unsorted[i] = hcoo_row_ind_unsorted[rng]; + hcoo_col_ind_unsorted[i] = hcoo_col_ind_unsorted[rng]; + 
hcoo_val_unsorted[i] = hcoo_val_unsorted[rng]; + + hcoo_row_ind_unsorted[rng] = temp_row; + hcoo_col_ind_unsorted[rng] = temp_col; + hcoo_val_unsorted[rng] = temp_val; + } + + // If coosort by column, sort host arrays by column + if(!by_row) + { + std::vector hperm(nnz); + for(rocsparse_int i = 0; i < nnz; ++i) + { + hperm[i] = i; + } + + std::sort(hperm.begin(), hperm.end(), [&](const int& a, const int& b) { + if(hcoo_col_ind_unsorted[a] < hcoo_col_ind_unsorted[b]) + { + return true; + } + else if(hcoo_col_ind_unsorted[a] == hcoo_col_ind_unsorted[b]) + { + return (hcoo_row_ind_unsorted[a] < hcoo_row_ind_unsorted[b]); + } + else + { + return false; + } + }); + + for(rocsparse_int i = 0; i < nnz; ++i) + { + hcoo_row_ind[i] = hcoo_row_ind_unsorted[hperm[i]]; + hcoo_col_ind[i] = hcoo_col_ind_unsorted[hperm[i]]; + hcoo_val[i] = hcoo_val_unsorted[hperm[i]]; + } + } + + // Allocate memory on the device + auto dcoo_row_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dcoo_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dcoo_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(float) * nnz), device_free}; + auto dcoo_val_sorted_managed = + rocsparse_unique_ptr{device_malloc(sizeof(float) * nnz), device_free}; + auto dperm_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + + rocsparse_int* dcoo_row_ind = (rocsparse_int*)dcoo_row_ind_managed.get(); + rocsparse_int* dcoo_col_ind = (rocsparse_int*)dcoo_col_ind_managed.get(); + float* dcoo_val = (float*)dcoo_val_managed.get(); + float* dcoo_val_sorted = (float*)dcoo_val_sorted_managed.get(); + + // Set permutation vector, if asked for + rocsparse_int* dperm = permute ? 
(rocsparse_int*)dperm_managed.get() : nullptr; + + if(!dcoo_row_ind || !dcoo_col_ind || !dcoo_val || !dcoo_val_sorted || (permute && !dperm)) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dcoo_row_ind || !dcoo_col_ind || !dcoo_val || " + "!dcoo_val_sorted || (permute && !dperm)"); + return rocsparse_status_memory_error; + } + + // Copy data from host to device + CHECK_HIP_ERROR(hipMemcpy(dcoo_row_ind, + hcoo_row_ind_unsorted.data(), + sizeof(rocsparse_int) * nnz, + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcoo_col_ind, + hcoo_col_ind_unsorted.data(), + sizeof(rocsparse_int) * nnz, + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dcoo_val, hcoo_val_unsorted.data(), sizeof(float) * nnz, hipMemcpyHostToDevice)); + + if(argus.unit_check) + { + // Obtain buffer size + CHECK_ROCSPARSE_ERROR(rocsparse_coosort_buffer_size( + handle, m, n, nnz, dcoo_row_ind, dcoo_col_ind, &buffer_size)); + + // Allocate buffer on the device + auto dbuffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * buffer_size), device_free}; + + void* dbuffer = (void*)dbuffer_managed.get(); + + if(!dbuffer) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dbuffer"); + return rocsparse_status_memory_error; + } + + if(permute) + { + // Initialize perm with identity permutation + CHECK_ROCSPARSE_ERROR(rocsparse_create_identity_permutation(handle, nnz, dperm)); + } + + // Sort CSR columns + if(by_row) + { + CHECK_ROCSPARSE_ERROR(rocsparse_coosort_by_row( + handle, m, n, nnz, dcoo_row_ind, dcoo_col_ind, dperm, dbuffer)); + } + else + { + CHECK_ROCSPARSE_ERROR(rocsparse_coosort_by_column( + handle, m, n, nnz, dcoo_row_ind, dcoo_col_ind, dperm, dbuffer)); + } + + if(permute) + { + // Sort CSR values + CHECK_ROCSPARSE_ERROR(rocsparse_sgthr( + handle, nnz, dcoo_val, dcoo_val_sorted, dperm, rocsparse_index_base_zero)); + } + + // Copy output from device to host + CHECK_HIP_ERROR(hipMemcpy(hcoo_row_ind_unsorted.data(), + 
dcoo_row_ind, + sizeof(rocsparse_int) * nnz, + hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hcoo_col_ind_unsorted.data(), + dcoo_col_ind, + sizeof(rocsparse_int) * nnz, + hipMemcpyDeviceToHost)); + + if(permute) + { + CHECK_HIP_ERROR(hipMemcpy(hcoo_val_unsorted.data(), + dcoo_val_sorted, + sizeof(float) * nnz, + hipMemcpyDeviceToHost)); + } + + // Unit check + unit_check_general(1, nnz, hcoo_row_ind.data(), hcoo_row_ind_unsorted.data()); + unit_check_general(1, nnz, hcoo_col_ind.data(), hcoo_col_ind_unsorted.data()); + + if(permute) + { + unit_check_general(1, nnz, hcoo_val.data(), hcoo_val_unsorted.data()); + } + } + + if(argus.timing) + { + rocsparse_int number_cold_calls = 2; + rocsparse_int number_hot_calls = argus.iters; + + // Allocate buffer for coosort + rocsparse_coosort_buffer_size(handle, m, n, nnz, dcoo_row_ind, dcoo_col_ind, &buffer_size); + + auto dbuffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * buffer_size), device_free}; + void* dbuffer = (void*)dbuffer_managed.get(); + + for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) + { + rocsparse_coosort_by_row( + handle, m, n, nnz, dcoo_row_ind, dcoo_col_ind, nullptr, dbuffer); + } + + double gpu_time_used = get_time_us(); + + for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) + { + rocsparse_coosort_by_row( + handle, m, n, nnz, dcoo_row_ind, dcoo_col_ind, nullptr, dbuffer); + } + + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + + printf("m\t\tn\t\tnnz\t\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\n", m, n, nnz, gpu_time_used); + } + return rocsparse_status_success; +} + +#endif // TESTING_COOSORT_HPP diff --git a/clients/tests/test_coosort.cpp b/clients/tests/test_coosort.cpp new file mode 100644 index 00000000..c3e8fe44 --- /dev/null +++ b/clients/tests/test_coosort.cpp @@ -0,0 +1,57 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "testing_coosort.hpp" +#include "utility.hpp" + +#include +#include +#include + +typedef std::tuple coosort_tuple; + +int coosort_M_range[] = {-1, 0, 10, 500, 3872, 10000}; +int coosort_N_range[] = {-3, 0, 33, 242, 1623, 10000}; +rocsparse_operation coosort_trans[] = {rocsparse_operation_none, rocsparse_operation_transpose}; +int coosort_perm[] = {0, 1}; +rocsparse_index_base coosort_base[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; + +class parameterized_coosort : public testing::TestWithParam +{ + protected: + parameterized_coosort() {} + virtual ~parameterized_coosort() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_coosort_arguments(coosort_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.trans = std::get<2>(tup); + arg.temp = std::get<3>(tup); + arg.idx_base = std::get<4>(tup); + arg.timing = 0; + return arg; +} + +TEST(coosort_bad_arg, coosort) { testing_coosort_bad_arg(); } + +TEST_P(parameterized_coosort, coosort) +{ + Arguments arg = setup_coosort_arguments(GetParam()); + + rocsparse_status status = testing_coosort(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(coosort, + parameterized_coosort, + testing::Combine(testing::ValuesIn(coosort_M_range), + testing::ValuesIn(coosort_N_range), + testing::ValuesIn(coosort_trans), + testing::ValuesIn(coosort_perm), + testing::ValuesIn(coosort_base))); From befef6517ac5a9bbf987e3230de56864e5acb603 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 13 Jul 2018 16:20:56 +0200 Subject: [PATCH 160/304] clang format --- clients/include/testing_coosort.hpp | 29 +-- clients/include/testing_csr2csc.hpp | 135 +++++++++----- clients/include/testing_csrsort.hpp | 18 +- clients/tests/test_coosort.cpp | 6 +- clients/tests/test_csr2csc.cpp | 10 +- library/src/conversion/rocsparse_coosort.cpp | 184 
++++++++++++++----- library/src/conversion/rocsparse_csr2csc.cpp | 3 +- library/src/conversion/rocsparse_csr2csc.hpp | 34 ++-- 8 files changed, 269 insertions(+), 150 deletions(-) diff --git a/clients/include/testing_coosort.hpp b/clients/include/testing_coosort.hpp index 67930d18..93dfdf1b 100644 --- a/clients/include/testing_coosort.hpp +++ b/clients/include/testing_coosort.hpp @@ -196,12 +196,13 @@ rocsparse_status testing_coosort(Arguments argus) rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto perm_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; - auto buffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + auto buffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; rocsparse_int* coo_row_ind = (rocsparse_int*)coo_row_ind_managed.get(); rocsparse_int* coo_col_ind = (rocsparse_int*)coo_col_ind_managed.get(); - rocsparse_int* perm = (rocsparse_int*)perm_managed.get(); - void* buffer = (void*)buffer_managed.get(); + rocsparse_int* perm = (rocsparse_int*)perm_managed.get(); + void* buffer = (void*)buffer_managed.get(); if(!coo_row_ind || !coo_col_ind || !perm || !buffer) { @@ -228,25 +229,13 @@ rocsparse_status testing_coosort(Arguments argus) if(by_row) { - status = rocsparse_coosort_by_row(handle, - m, - n, - nnz, - coo_row_ind, - coo_col_ind, - perm, - buffer); + status = + rocsparse_coosort_by_row(handle, m, n, nnz, coo_row_ind, coo_col_ind, perm, buffer); } else { - status = rocsparse_coosort_by_column(handle, - m, - n, - nnz, - coo_row_ind, - coo_col_ind, - perm, - buffer); + status = rocsparse_coosort_by_column( + handle, m, n, nnz, coo_row_ind, coo_col_ind, perm, buffer); } if(m < 0 || n < 0 || nnz < 0) @@ -357,7 +346,7 @@ rocsparse_status testing_coosort(Arguments argus) { hcoo_row_ind[i] = hcoo_row_ind_unsorted[hperm[i]]; hcoo_col_ind[i] = hcoo_col_ind_unsorted[hperm[i]]; - hcoo_val[i] = 
hcoo_val_unsorted[hperm[i]]; + hcoo_val[i] = hcoo_val_unsorted[hperm[i]]; } } diff --git a/clients/include/testing_csr2csc.hpp b/clients/include/testing_csr2csc.hpp index 8d290840..aea17b66 100644 --- a/clients/include/testing_csr2csc.hpp +++ b/clients/include/testing_csr2csc.hpp @@ -41,7 +41,8 @@ void testing_csr2csc_bad_arg(void) auto csc_col_ptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto csc_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; - auto buffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + auto buffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); @@ -51,7 +52,8 @@ void testing_csr2csc_bad_arg(void) T* csc_val = (T*)csc_val_managed.get(); void* buffer = (void*)buffer_managed.get(); - if(!csr_row_ptr || !csr_col_ind || !csr_val || !csc_row_ind || !csc_col_ptr || !csc_val || !buffer) + if(!csr_row_ptr || !csr_col_ind || !csr_val || !csc_row_ind || !csc_col_ptr || !csc_val || + !buffer) { PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); return; @@ -63,14 +65,8 @@ void testing_csr2csc_bad_arg(void) { rocsparse_int* csr_row_ptr_null = nullptr; - status = rocsparse_csr2csc_buffer_size(handle, - m, - n, - nnz, - csr_row_ptr_null, - csr_col_ind, - rocsparse_action_numeric, - &size); + status = rocsparse_csr2csc_buffer_size( + handle, m, n, nnz, csr_row_ptr_null, csr_col_ind, rocsparse_action_numeric, &size); verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr"); } @@ -78,14 +74,8 @@ void testing_csr2csc_bad_arg(void) { rocsparse_int* csr_col_ind_null = nullptr; - status = rocsparse_csr2csc_buffer_size(handle, - m, - n, - nnz, - csr_row_ptr, - csr_col_ind_null, - rocsparse_action_numeric, - &size); + status = 
rocsparse_csr2csc_buffer_size( + handle, m, n, nnz, csr_row_ptr, csr_col_ind_null, rocsparse_action_numeric, &size); verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr"); } @@ -108,14 +98,8 @@ void testing_csr2csc_bad_arg(void) { rocsparse_handle handle_null = nullptr; - status = rocsparse_csr2csc_buffer_size(handle_null, - m, - n, - nnz, - csr_row_ptr, - csr_col_ind, - rocsparse_action_numeric, - &size); + status = rocsparse_csr2csc_buffer_size( + handle_null, m, n, nnz, csr_row_ptr, csr_col_ind, rocsparse_action_numeric, &size); verify_rocsparse_status_invalid_handle(status); } @@ -307,13 +291,20 @@ rocsparse_status testing_csr2csc(Arguments argus) // Argument sanity check before allocating invalid memory if(m <= 0 || n <= 0 || nnz <= 0) { - auto csr_row_ptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; - auto csr_col_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; - auto csr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; - auto csc_row_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; - auto csc_col_ptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; - auto csc_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; - auto buffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + auto csr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto csc_row_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto 
csc_col_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csc_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto buffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); @@ -321,9 +312,10 @@ rocsparse_status testing_csr2csc(Arguments argus) rocsparse_int* csc_row_ind = (rocsparse_int*)csc_row_ind_managed.get(); rocsparse_int* csc_col_ptr = (rocsparse_int*)csc_col_ptr_managed.get(); T* csc_val = (T*)csc_val_managed.get(); - void* buffer = (void*)buffer_managed.get(); + void* buffer = (void*)buffer_managed.get(); - if(!csr_row_ptr || !csr_col_ind || !csr_val || !csc_row_ind || !csc_col_ptr || !csc_val || !buffer) + if(!csr_row_ptr || !csr_col_ind || !csr_val || !csc_row_ind || !csc_col_ptr || !csc_val || + !buffer) { verify_rocsparse_status_success(rocsparse_status_memory_error, "!csr_row_ptr || !csr_col_ind || !csr_val || " @@ -418,11 +410,15 @@ rocsparse_status testing_csr2csc(Arguments argus) } // Allocate memory on the device - auto dcsr_row_ptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; - auto dcsr_col_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dcsr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + auto dcsr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; auto dcsr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; - auto dcsc_row_ind_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; - auto dcsc_col_ptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (n + 1)), device_free}; + auto 
dcsc_row_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dcsc_col_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (n + 1)), device_free}; auto dcsc_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; rocsparse_int* dcsr_row_ptr = (rocsparse_int*)dcsr_row_ptr_managed.get(); @@ -446,8 +442,10 @@ rocsparse_status testing_csr2csc(Arguments argus) CHECK_HIP_ERROR(hipMemset(dcsc_val, 0, sizeof(T) * nnz)); // Copy data from host to device - CHECK_HIP_ERROR(hipMemcpy(dcsr_row_ptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dcsr_col_ind, hcsr_col_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy( + dcsr_row_ptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy( + dcsr_col_ind, hcsr_col_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dcsr_val, hcsr_val.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); // Obtain buffer size @@ -467,16 +465,33 @@ rocsparse_status testing_csr2csc(Arguments argus) if(argus.unit_check) { - CHECK_ROCSPARSE_ERROR(rocsparse_csr2csc(handle, m, n, nnz, dcsr_val, dcsr_row_ptr, dcsr_col_ind, dcsc_val, dcsc_row_ind, dcsc_col_ptr, action, idx_base, dbuffer)); + CHECK_ROCSPARSE_ERROR(rocsparse_csr2csc(handle, + m, + n, + nnz, + dcsr_val, + dcsr_row_ptr, + dcsr_col_ind, + dcsc_val, + dcsc_row_ind, + dcsc_col_ptr, + action, + idx_base, + dbuffer)); // Copy output from device to host std::vector hcsc_row_ind(nnz); std::vector hcsc_col_ptr(n + 1); std::vector hcsc_val(nnz); - CHECK_HIP_ERROR(hipMemcpy(hcsc_row_ind.data(), dcsc_row_ind, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hcsc_col_ptr.data(), dcsc_col_ptr, sizeof(rocsparse_int) * (n + 1), hipMemcpyDeviceToHost)); - 
CHECK_HIP_ERROR(hipMemcpy(hcsc_val.data(), dcsc_val, sizeof(T) * nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy( + hcsc_row_ind.data(), dcsc_row_ind, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hcsc_col_ptr.data(), + dcsc_col_ptr, + sizeof(rocsparse_int) * (n + 1), + hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(hcsc_val.data(), dcsc_val, sizeof(T) * nnz, hipMemcpyDeviceToHost)); // Host csr2csc conversion std::vector hcsc_row_ind_gold(nnz); @@ -504,7 +519,7 @@ rocsparse_status testing_csr2csc(Arguments argus) rocsparse_int idx = hcsc_col_ptr_gold[col]; hcsc_row_ind_gold[idx] = i + idx_base; - hcsc_val_gold[idx] = hcsr_val[j - idx_base]; + hcsc_val_gold[idx] = hcsr_val[j - idx_base]; ++hcsc_col_ptr_gold[col]; } @@ -536,14 +551,38 @@ rocsparse_status testing_csr2csc(Arguments argus) for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) { - rocsparse_csr2csc(handle, m, n, nnz, dcsr_val, dcsr_row_ptr, dcsr_col_ind, dcsc_val, dcsc_row_ind, dcsc_col_ptr, rocsparse_action_numeric, rocsparse_index_base_zero, dbuffer); + rocsparse_csr2csc(handle, + m, + n, + nnz, + dcsr_val, + dcsr_row_ptr, + dcsr_col_ind, + dcsc_val, + dcsc_row_ind, + dcsc_col_ptr, + rocsparse_action_numeric, + rocsparse_index_base_zero, + dbuffer); } double gpu_time_used = get_time_us(); for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) { - rocsparse_csr2csc(handle, m, n, nnz, dcsr_val, dcsr_row_ptr, dcsr_col_ind, dcsc_val, dcsc_row_ind, dcsc_col_ptr, rocsparse_action_numeric, rocsparse_index_base_zero, dbuffer); + rocsparse_csr2csc(handle, + m, + n, + nnz, + dcsr_val, + dcsr_row_ptr, + dcsr_col_ind, + dcsc_val, + dcsc_row_ind, + dcsc_col_ptr, + rocsparse_action_numeric, + rocsparse_index_base_zero, + dbuffer); } gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); diff --git a/clients/include/testing_csrsort.hpp b/clients/include/testing_csrsort.hpp index 6596f64c..6236fb7b 100644 --- 
a/clients/include/testing_csrsort.hpp +++ b/clients/include/testing_csrsort.hpp @@ -175,12 +175,13 @@ rocsparse_status testing_csrsort(Arguments argus) rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto perm_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; - auto buffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + auto buffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); - rocsparse_int* perm = (rocsparse_int*)perm_managed.get(); - void* buffer = (void*)buffer_managed.get(); + rocsparse_int* perm = (rocsparse_int*)perm_managed.get(); + void* buffer = (void*)buffer_managed.get(); if(!csr_row_ptr || !csr_col_ind || !perm || !buffer) { @@ -205,15 +206,8 @@ rocsparse_status testing_csrsort(Arguments argus) unit_check_general(1, 1, &zero, &buffer_size); } - status = rocsparse_csrsort(handle, - m, - n, - nnz, - descr, - csr_row_ptr, - csr_col_ind, - perm, - buffer); + status = + rocsparse_csrsort(handle, m, n, nnz, descr, csr_row_ptr, csr_col_ind, perm, buffer); if(m < 0 || n < 0 || nnz < 0) { diff --git a/clients/tests/test_coosort.cpp b/clients/tests/test_coosort.cpp index c3e8fe44..624c799b 100644 --- a/clients/tests/test_coosort.cpp +++ b/clients/tests/test_coosort.cpp @@ -11,10 +11,10 @@ typedef std::tuple coosort_tuple; -int coosort_M_range[] = {-1, 0, 10, 500, 3872, 10000}; -int coosort_N_range[] = {-3, 0, 33, 242, 1623, 10000}; +int coosort_M_range[] = {-1, 0, 10, 500, 3872, 10000}; +int coosort_N_range[] = {-3, 0, 33, 242, 1623, 10000}; rocsparse_operation coosort_trans[] = {rocsparse_operation_none, rocsparse_operation_transpose}; -int coosort_perm[] = {0, 1}; +int coosort_perm[] = {0, 1}; rocsparse_index_base coosort_base[] = 
{rocsparse_index_base_zero, rocsparse_index_base_one}; class parameterized_coosort : public testing::TestWithParam diff --git a/clients/tests/test_csr2csc.cpp b/clients/tests/test_csr2csc.cpp index 009b1b31..121d3fe6 100644 --- a/clients/tests/test_csr2csc.cpp +++ b/clients/tests/test_csr2csc.cpp @@ -31,11 +31,11 @@ class parameterized_csr2csc : public testing::TestWithParam Arguments setup_csr2csc_arguments(csr2csc_tuple tup) { Arguments arg; - arg.M = std::get<0>(tup); - arg.N = std::get<1>(tup); - arg.action = std::get<2>(tup); - arg.idx_base = std::get<3>(tup); - arg.timing = 0; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.action = std::get<2>(tup); + arg.idx_base = std::get<3>(tup); + arg.timing = 0; return arg; } diff --git a/library/src/conversion/rocsparse_coosort.cpp b/library/src/conversion/rocsparse_coosort.cpp index f6c97e5a..09ba1e34 100644 --- a/library/src/conversion/rocsparse_coosort.cpp +++ b/library/src/conversion/rocsparse_coosort.cpp @@ -84,19 +84,23 @@ extern "C" rocsparse_status rocsparse_coosort_buffer_size(rocsparse_handle handl *buffer_size = 0; hipcub::DoubleBuffer dummy(ptr, ptr); - RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode(nullptr, size, ptr, ptr, ptr, ptr, nnz, stream)); + RETURN_IF_HIP_ERROR( + hipcub::DeviceRunLengthEncode::Encode(nullptr, size, ptr, ptr, ptr, ptr, nnz, stream)); *buffer_size = std::max(size, *buffer_size); RETURN_IF_HIP_ERROR(hipcub::DeviceScan::ExclusiveSum(nullptr, size, ptr, ptr, m + 1, stream)); *buffer_size = std::max(size, *buffer_size); - RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(nullptr, size, dummy, dummy, nnz, 0, 32, stream)); + RETURN_IF_HIP_ERROR( + hipcub::DeviceRadixSort::SortPairs(nullptr, size, dummy, dummy, nnz, 0, 32, stream)); *buffer_size = std::max(size, *buffer_size); #if defined(__HIP_PLATFORM_HCC__) rocprim::double_buffer rpdummy(ptr, ptr); - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(nullptr, size, rpdummy, rpdummy, nnz, m, ptr, ptr + 
1, 0, 32, stream)); + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs( + nullptr, size, rpdummy, rpdummy, nnz, m, ptr, ptr + 1, 0, 32, stream)); *buffer_size = std::max(size, *buffer_size); #elif defined(__HIP_PLATFORM_NVCC__) - RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, size, dummy, dummy, nnz, m, ptr, ptr, 0, 32, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs( + nullptr, size, dummy, dummy, nnz, m, ptr, ptr, 0, 32, stream)); *buffer_size = std::max(size, *buffer_size); #endif *buffer_size = ((*buffer_size - 1) / 256 + 1) * 256; @@ -199,7 +203,7 @@ extern "C" rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, ptr += sizeof(rocsparse_int) * (std::max(m, n) / 256 + 1) * 256; // Temporary rocprim buffer - size_t size = 0; + size_t size = 0; void* tmp_rocprim = reinterpret_cast(ptr); if(perm != nullptr) @@ -211,30 +215,37 @@ extern "C" rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, hipcub::DoubleBuffer keys(coo_row_ind, work3); hipcub::DoubleBuffer vals(work1, work2); - RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(nullptr, size, keys, vals, nnz, startbit, endbit, stream)); - RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(tmp_rocprim, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs( + nullptr, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs( + tmp_rocprim, size, keys, vals, nnz, startbit, endbit, stream)); - rocsparse_int* output = keys.Current(); + rocsparse_int* output = keys.Current(); rocsparse_int* mapping = vals.Current(); rocsparse_int* alt_map = vals.Alternate(); // Copy sorted rows, if stored in buffer if(output != coo_row_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy(coo_row_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpy( + coo_row_ind, output, sizeof(rocsparse_int) * nnz, 
hipMemcpyDeviceToDevice)); } // Obtain segments for segmented sort by columns - RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode(nullptr, size, coo_row_ind, work3 + 1, work4, work3, nnz, stream)); - RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode(tmp_rocprim, size, coo_row_ind, work3 + 1, work4, work3, nnz, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode( + nullptr, size, coo_row_ind, work3 + 1, work4, work3, nnz, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode( + tmp_rocprim, size, coo_row_ind, work3 + 1, work4, work3, nnz, stream)); rocsparse_int nsegm; RETURN_IF_HIP_ERROR(hipMemcpy(&nsegm, work3, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); - RETURN_IF_HIP_ERROR(hipcub::DeviceScan::ExclusiveSum(nullptr, size, work4, work4, nsegm + 1, stream)); - RETURN_IF_HIP_ERROR(hipcub::DeviceScan::ExclusiveSum(tmp_rocprim, size, work4, work4, nsegm + 1, stream)); + RETURN_IF_HIP_ERROR( + hipcub::DeviceScan::ExclusiveSum(nullptr, size, work4, work4, nsegm + 1, stream)); + RETURN_IF_HIP_ERROR( + hipcub::DeviceScan::ExclusiveSum(tmp_rocprim, size, work4, work4, nsegm + 1, stream)); - // Reorder columns +// Reorder columns #define COOSORT_DIM 512 dim3 coosort_blocks((nnz - 1) / COOSORT_DIM + 1); dim3 coosort_threads(COOSORT_DIM); @@ -267,90 +278,156 @@ extern "C" rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, rocprim::double_buffer keys2(work3, coo_col_ind); rocprim::double_buffer vals2(alt_map, perm); - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(nullptr, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs( + nullptr, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); rocsparse_int avg_row_nnz = nnz / nsegm; if(avg_row_nnz < 64) { - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; - 
RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + using config = + rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, + size, + keys2, + vals2, + nnz, + nsegm, + work4, + work4 + 1, + startbit, + endbit, + stream)); } else if(avg_row_nnz < 128) { - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + using config = + rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, + size, + keys2, + vals2, + nnz, + nsegm, + work4, + work4 + 1, + startbit, + endbit, + stream)); } else if(avg_row_nnz < 256) { - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + using config = + rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, + size, + keys2, + vals2, + nnz, + nsegm, + work4, + work4 + 1, + startbit, + endbit, + stream)); } else { - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_pairs(tmp_rocprim, + size, + keys2, + vals2, + nnz, + nsegm, + work4, + work4 + 1, + startbit, + endbit, + stream)); } - output = keys2.current(); + output = keys2.current(); mapping = vals2.current(); #elif defined(__HIP_PLATFORM_NVCC__) hipcub::DoubleBuffer keys2(work3, coo_col_ind); hipcub::DoubleBuffer 
vals2(alt_map, perm); - RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(nullptr, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); - RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(tmp_rocprim, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); - - output = keys2.Current(); + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs( + nullptr, size, keys2, vals2, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs(tmp_rocprim, + size, + keys2, + vals2, + nnz, + nsegm, + work4, + work4 + 1, + startbit, + endbit, + stream)); + + output = keys2.Current(); mapping = vals2.Current(); #endif // Copy sorted columns, if stored in buffer if(output != coo_col_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy(coo_col_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpy( + coo_col_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); } // Copy reordered permutation, if stored in buffer if(mapping != perm) { - RETURN_IF_HIP_ERROR(hipMemcpy(perm, mapping, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR( + hipMemcpy(perm, mapping, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); } } else { - // No permutation vector given +// No permutation vector given - // Sort by rows and permute columns +// Sort by rows and permute columns #if defined(__HIP_PLATFORM_HCC__) rocprim::double_buffer keys(coo_row_ind, work3); rocprim::double_buffer vals(coo_col_ind, work2); - RETURN_IF_HIP_ERROR(rocprim::radix_sort_pairs(nullptr, size, keys, vals, nnz, startbit, endbit, stream)); - RETURN_IF_HIP_ERROR(rocprim::radix_sort_pairs(tmp_rocprim, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR( + rocprim::radix_sort_pairs(nullptr, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(rocprim::radix_sort_pairs( 
+ tmp_rocprim, size, keys, vals, nnz, startbit, endbit, stream)); rocsparse_int* output = keys.current(); #elif defined(__HIP_PLATFORM_NVCC__) hipcub::DoubleBuffer keys(coo_row_ind, work3); hipcub::DoubleBuffer vals(coo_col_ind, work2); - RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(nullptr, size, keys, vals, nnz, startbit, endbit, stream)); - RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(tmp_rocprim, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs( + nullptr, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs( + tmp_rocprim, size, keys, vals, nnz, startbit, endbit, stream)); rocsparse_int* output = keys.Current(); #endif // Copy sorted rows, if stored in buffer if(output != coo_row_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy(coo_row_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpy( + coo_row_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); } // Obtain segments for segmented sort by columns - RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode(nullptr, size, coo_row_ind, work3 + 1, work4, work3, nnz, stream)); - RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode(tmp_rocprim, size, coo_row_ind, work3 + 1, work4, work3, nnz, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode( + nullptr, size, coo_row_ind, work3 + 1, work4, work3, nnz, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRunLengthEncode::Encode( + tmp_rocprim, size, coo_row_ind, work3 + 1, work4, work3, nnz, stream)); rocsparse_int nsegm; RETURN_IF_HIP_ERROR(hipMemcpy(&nsegm, work3, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); - RETURN_IF_HIP_ERROR(hipcub::DeviceScan::ExclusiveSum(nullptr, size, work4, work4, nsegm + 1, stream)); - RETURN_IF_HIP_ERROR(hipcub::DeviceScan::ExclusiveSum(tmp_rocprim, size, work4, work4, nsegm + 1, stream)); + RETURN_IF_HIP_ERROR( + 
hipcub::DeviceScan::ExclusiveSum(nullptr, size, work4, work4, nsegm + 1, stream)); + RETURN_IF_HIP_ERROR( + hipcub::DeviceScan::ExclusiveSum(tmp_rocprim, size, work4, work4, nsegm + 1, stream)); // Sort columns per row endbit = rocsparse_clz(n); @@ -360,33 +437,42 @@ extern "C" rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, if(avg_row_nnz < 64) { - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys(tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + using config = + rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 1>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( + tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); } else if(avg_row_nnz < 128) { - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys(tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + using config = + rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 2>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( + tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); } else if(avg_row_nnz < 256) { - using config = rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys(tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + using config = + rocprim::segmented_radix_sort_config<6, 5, rocprim::kernel_config<64, 4>>; + RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( + tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); } else { - RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys(tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + 
RETURN_IF_HIP_ERROR(rocprim::segmented_radix_sort_keys( + tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); } output = vals.current(); #elif defined(__HIP_PLATFORM_NVCC__) - RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortKeys(tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortKeys( + tmp_rocprim, size, vals, nnz, nsegm, work4, work4 + 1, startbit, endbit, stream)); output = vals.Current(); #endif // Copy sorted columns, if stored in buffer if(output != coo_col_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy(coo_col_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpy( + coo_col_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); } } diff --git a/library/src/conversion/rocsparse_csr2csc.cpp b/library/src/conversion/rocsparse_csr2csc.cpp index cf62f5e0..6a3445ba 100644 --- a/library/src/conversion/rocsparse_csr2csc.cpp +++ b/library/src/conversion/rocsparse_csr2csc.cpp @@ -81,7 +81,8 @@ extern "C" rocsparse_status rocsparse_csr2csc_buffer_size(rocsparse_handle handl rocsparse_int* ptr = reinterpret_cast(buffer_size); hipcub::DoubleBuffer dummy(ptr, ptr); - RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(nullptr, *buffer_size, dummy, dummy, nnz, 0, 32, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs( + nullptr, *buffer_size, dummy, dummy, nnz, 0, 32, stream)); // rocPRIM does not support in-place sorting, so we need additional buffer // for all temporary arrays diff --git a/library/src/conversion/rocsparse_csr2csc.hpp b/library/src/conversion/rocsparse_csr2csc.hpp index 06392068..876ea6cf 100644 --- a/library/src/conversion/rocsparse_csr2csc.hpp +++ b/library/src/conversion/rocsparse_csr2csc.hpp @@ -104,7 +104,7 @@ rocsparse_status rocsparse_csr2csc_template(rocsparse_handle handle, hipStream_t stream = handle->stream; unsigned int startbit = 0; - 
unsigned int endbit = rocsparse_clz(n); + unsigned int endbit = rocsparse_clz(n); // Temporary buffer entry points char* ptr = reinterpret_cast(temp_buffer); @@ -125,30 +125,36 @@ rocsparse_status rocsparse_csr2csc_template(rocsparse_handle handle, void* tmp_hipcub = reinterpret_cast(ptr); // Load CSR column indices into work1 buffer - RETURN_IF_HIP_ERROR(hipMemcpy(tmp_work1, csr_col_ind, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR( + hipMemcpy(tmp_work1, csr_col_ind, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); if(copy_values == rocsparse_action_symbolic) { // action symbolic // Create row indices - RETURN_IF_ROCSPARSE_ERROR(rocsparse_csr2coo(handle, csr_row_ptr, nnz, m, csc_row_ind, idx_base)); + RETURN_IF_ROCSPARSE_ERROR( + rocsparse_csr2coo(handle, csr_row_ptr, nnz, m, csc_row_ind, idx_base)); // Stable sort COO by columns hipcub::DoubleBuffer keys(tmp_work1, tmp_perm); hipcub::DoubleBuffer vals(csc_row_ind, tmp_work2); size_t size = 0; - RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(nullptr, size, keys, vals, nnz, startbit, endbit, stream)); - RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(tmp_hipcub, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs( + nullptr, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs( + tmp_hipcub, size, keys, vals, nnz, startbit, endbit, stream)); // Create column pointers - RETURN_IF_ROCSPARSE_ERROR(rocsparse_coo2csr(handle, keys.Current(), nnz, n, csc_col_ptr, idx_base)); + RETURN_IF_ROCSPARSE_ERROR( + rocsparse_coo2csr(handle, keys.Current(), nnz, n, csc_col_ptr, idx_base)); // Copy csc_row_ind if not current if(vals.Current() != csc_row_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy(csc_row_ind, vals.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpy( + csc_row_ind, vals.Current(), sizeof(rocsparse_int) * nnz, 
hipMemcpyDeviceToDevice)); } } else @@ -164,16 +170,20 @@ rocsparse_status rocsparse_csr2csc_template(rocsparse_handle handle, size_t size = 0; - RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(nullptr, size, keys, vals, nnz, startbit, endbit, stream)); - RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs(tmp_hipcub, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs( + nullptr, size, keys, vals, nnz, startbit, endbit, stream)); + RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs( + tmp_hipcub, size, keys, vals, nnz, startbit, endbit, stream)); // Create column pointers - RETURN_IF_ROCSPARSE_ERROR(rocsparse_coo2csr(handle, keys.Current(), nnz, n, csc_col_ptr, idx_base)); + RETURN_IF_ROCSPARSE_ERROR( + rocsparse_coo2csr(handle, keys.Current(), nnz, n, csc_col_ptr, idx_base)); // Create row indices - RETURN_IF_ROCSPARSE_ERROR(rocsparse_csr2coo(handle, csr_row_ptr, nnz, m, tmp_work1, idx_base)); + RETURN_IF_ROCSPARSE_ERROR( + rocsparse_csr2coo(handle, csr_row_ptr, nnz, m, tmp_work1, idx_base)); - // Permute row indices and values +// Permute row indices and values #define CSR2CSC_DIM 512 dim3 csr2csc_blocks((nnz - 1) / CSR2CSC_DIM + 1); dim3 csr2csc_threads(CSR2CSC_DIM); From 33b45046d4817a3ee776541711ba7ee1ba5fdd01 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 13 Jul 2018 16:28:48 +0200 Subject: [PATCH 161/304] replaced __llvm by __hip intrinsics --- library/src/level2/csrmv_device.h | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index 8b0c672c..dac0cd59 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -6,7 +6,6 @@ #if defined(__HIP_PLATFORM_HCC__) // While HIP does not contain llvm intrinsics -__device__ int __llvm_amdgcn_ds_swizzle(int index, int pattern) __asm("llvm.amdgcn.ds.swizzle"); __device__ int 
__llvm_amdgcn_readlane(int index, int offset) __asm("llvm.amdgcn.readlane"); #endif @@ -17,11 +16,11 @@ __device__ float reduction(float sum) { // clang-format off if(SUBWAVE_SIZE > 32) sum += __llvm_amdgcn_readlane(sum, 32); - if(SUBWAVE_SIZE > 16) sum += __llvm_amdgcn_ds_swizzle(sum, 0x401f); - if(SUBWAVE_SIZE > 8) sum += __llvm_amdgcn_ds_swizzle(sum, 0x201f); - if(SUBWAVE_SIZE > 4) sum += __llvm_amdgcn_ds_swizzle(sum, 0x101f); - if(SUBWAVE_SIZE > 2) sum += __llvm_amdgcn_ds_swizzle(sum, 0x081f); - if(SUBWAVE_SIZE > 1) sum += __llvm_amdgcn_ds_swizzle(sum, 0x041f); + if(SUBWAVE_SIZE > 16) sum += __hip_ds_swizzle(sum, 0x401f); + if(SUBWAVE_SIZE > 8) sum += __hip_ds_swizzle(sum, 0x201f); + if(SUBWAVE_SIZE > 4) sum += __hip_ds_swizzle(sum, 0x101f); + if(SUBWAVE_SIZE > 2) sum += __hip_ds_swizzle(sum, 0x081f); + if(SUBWAVE_SIZE > 1) sum += __hip_ds_swizzle(sum, 0x041f); // clang-format on return sum; @@ -51,36 +50,36 @@ __device__ double reduction(double sum) if(SUBWAVE_SIZE > 16) { - upper_sum.b32[0] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[0], 0x401f); - upper_sum.b32[1] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[1], 0x401f); + upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x401f); + upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x401f); temp_sum.val += upper_sum.val; } if(SUBWAVE_SIZE > 8) { - upper_sum.b32[0] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[0], 0x201f); - upper_sum.b32[1] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[1], 0x201f); + upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x201f); + upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x201f); temp_sum.val += upper_sum.val; } if(SUBWAVE_SIZE > 4) { - upper_sum.b32[0] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[0], 0x101f); - upper_sum.b32[1] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[1], 0x101f); + upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x101f); + upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x101f); temp_sum.val += upper_sum.val; } if(SUBWAVE_SIZE > 2) { - upper_sum.b32[0] = 
__llvm_amdgcn_ds_swizzle(temp_sum.b32[0], 0x081f); - upper_sum.b32[1] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[1], 0x081f); + upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x081f); + upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x081f); temp_sum.val += upper_sum.val; } if(SUBWAVE_SIZE > 1) { - upper_sum.b32[0] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[0], 0x041f); - upper_sum.b32[1] = __llvm_amdgcn_ds_swizzle(temp_sum.b32[1], 0x041f); + upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x041f); + upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x041f); temp_sum.val += upper_sum.val; } From 508456f67fdfcfe6850936e7675dcc5900b65f61 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 13 Jul 2018 16:39:49 +0200 Subject: [PATCH 162/304] changed buffer memory alignment to 256 bytes --- library/src/conversion/rocsparse_csr2csc.cpp | 6 +++++- library/src/conversion/rocsparse_csr2csc.hpp | 6 +++--- library/src/conversion/rocsparse_csrsort.cpp | 19 +++++++++---------- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/library/src/conversion/rocsparse_csr2csc.cpp b/library/src/conversion/rocsparse_csr2csc.cpp index 6a3445ba..9ec4f4ff 100644 --- a/library/src/conversion/rocsparse_csr2csc.cpp +++ b/library/src/conversion/rocsparse_csr2csc.cpp @@ -84,9 +84,13 @@ extern "C" rocsparse_status rocsparse_csr2csc_buffer_size(rocsparse_handle handl RETURN_IF_HIP_ERROR(hipcub::DeviceRadixSort::SortPairs( nullptr, *buffer_size, dummy, dummy, nnz, 0, 32, stream)); + *buffer_size = ((*buffer_size - 1) / 256 + 1) * 256; + // rocPRIM does not support in-place sorting, so we need additional buffer // for all temporary arrays - *buffer_size += sizeof(rocsparse_int) * nnz * 3; + *buffer_size += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; + *buffer_size += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; + *buffer_size += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; return rocsparse_status_success; } diff --git 
a/library/src/conversion/rocsparse_csr2csc.hpp b/library/src/conversion/rocsparse_csr2csc.hpp index 876ea6cf..64de0a3b 100644 --- a/library/src/conversion/rocsparse_csr2csc.hpp +++ b/library/src/conversion/rocsparse_csr2csc.hpp @@ -111,15 +111,15 @@ rocsparse_status rocsparse_csr2csc_template(rocsparse_handle handle, // work1 buffer rocsparse_int* tmp_work1 = reinterpret_cast(ptr); - ptr += sizeof(rocsparse_int) * nnz; + ptr += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; // work2 buffer rocsparse_int* tmp_work2 = reinterpret_cast(ptr); - ptr += sizeof(rocsparse_int) * nnz; + ptr += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; // perm buffer rocsparse_int* tmp_perm = reinterpret_cast(ptr); - ptr += sizeof(rocsparse_int) * nnz; + ptr += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; // hipcub buffer void* tmp_hipcub = reinterpret_cast(ptr); diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index ddb9256d..3febe9cd 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -90,16 +90,17 @@ extern "C" rocsparse_status rocsparse_csrsort_buffer_size(rocsparse_handle handl RETURN_IF_HIP_ERROR(hipcub::DeviceSegmentedRadixSort::SortPairs( nullptr, *buffer_size, dummy, dummy, nnz, m, buffer_size, buffer_size, 0, 32, stream)); #endif + *buffer_size = ((*buffer_size - 1) / 256 + 1) * 256; // rocPRIM does not support in-place sorting, so we need additional buffer // for all temporary arrays // columns buffer - *buffer_size += sizeof(rocsparse_int) * nnz; + *buffer_size += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; // perm buffer - *buffer_size += sizeof(rocsparse_int) * nnz; + *buffer_size += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; // segm buffer - *buffer_size += sizeof(rocsparse_int) * (m + 1); + *buffer_size += sizeof(rocsparse_int) * (m / 256 + 1) * 256; return rocsparse_status_success; } @@ -231,21 +232,19 @@ extern "C" 
rocsparse_status rocsparse_csrsort(rocsparse_handle handle, // columns buffer rocsparse_int* tmp_cols = reinterpret_cast(ptr); - ptr += sizeof(rocsparse_int) * nnz; + ptr += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; // perm buffer rocsparse_int* tmp_perm = reinterpret_cast(ptr); - ptr += sizeof(rocsparse_int) * nnz; + ptr += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; // segm buffer - rocsparse_int* tmp_segm = nullptr; + rocsparse_int* tmp_segm = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; // Index base one requires shift of offset positions if(descr->base == rocsparse_index_base_one) { - tmp_segm = reinterpret_cast(ptr); - ptr += sizeof(rocsparse_int) * nnz; - #define CSRSORT_DIM 512 dim3 csrsort_blocks(m / CSRSORT_DIM + 1); dim3 csrsort_threads(CSRSORT_DIM); @@ -265,7 +264,7 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, void* tmp_rocprim = reinterpret_cast(ptr); // Switch between offsets - const rocsparse_int* offsets = tmp_segm ? tmp_segm : csr_row_ptr; + const rocsparse_int* offsets = descr->base == rocsparse_index_base_one ? 
tmp_segm : csr_row_ptr; // Sort by columns and obtain permutation vector From 87363ec011fa0a8145eceb09a39569f811f281ac Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 13 Jul 2018 16:43:06 +0200 Subject: [PATCH 163/304] clang-format --- .../rocsparse_template_specialization.cpp | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 270b3f8d..40034d52 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -301,7 +301,19 @@ rocsparse_status rocsparse_csr2csc(rocsparse_handle handle, rocsparse_index_base idx_base, void* temp_buffer) { - return rocsparse_scsr2csc(handle, m, n, nnz, csr_val, csr_row_ptr, csr_col_ind, csc_val, csc_row_ind, csc_col_ptr, copy_values, idx_base, temp_buffer); + return rocsparse_scsr2csc(handle, + m, + n, + nnz, + csr_val, + csr_row_ptr, + csr_col_ind, + csc_val, + csc_row_ind, + csc_col_ptr, + copy_values, + idx_base, + temp_buffer); } template <> @@ -319,7 +331,19 @@ rocsparse_status rocsparse_csr2csc(rocsparse_handle handle, rocsparse_index_base idx_base, void* temp_buffer) { - return rocsparse_dcsr2csc(handle, m, n, nnz, csr_val, csr_row_ptr, csr_col_ind, csc_val, csc_row_ind, csc_col_ptr, copy_values, idx_base, temp_buffer); + return rocsparse_dcsr2csc(handle, + m, + n, + nnz, + csr_val, + csr_row_ptr, + csr_col_ind, + csc_val, + csc_row_ind, + csc_col_ptr, + copy_values, + idx_base, + temp_buffer); } template <> From a91190f53ff878f25b528044fedda23064d201cc Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 18 Jul 2018 17:34:28 +0200 Subject: [PATCH 164/304] bugfix for csrmv that occured when average row nnz exceeded 64 --- library/src/level2/csrmv_device.h | 95 +++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 29 deletions(-) diff --git a/library/src/level2/csrmv_device.h 
b/library/src/level2/csrmv_device.h index dac0cd59..a1aa7728 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -10,23 +10,61 @@ __device__ int __llvm_amdgcn_readlane(int index, int offset) __asm("llvm.amdgcn. #endif #if defined(__HIP_PLATFORM_HCC__) -// Swizzle-based reduction +// Swizzle-based float reduction template __device__ float reduction(float sum) { - // clang-format off - if(SUBWAVE_SIZE > 32) sum += __llvm_amdgcn_readlane(sum, 32); - if(SUBWAVE_SIZE > 16) sum += __hip_ds_swizzle(sum, 0x401f); - if(SUBWAVE_SIZE > 8) sum += __hip_ds_swizzle(sum, 0x201f); - if(SUBWAVE_SIZE > 4) sum += __hip_ds_swizzle(sum, 0x101f); - if(SUBWAVE_SIZE > 2) sum += __hip_ds_swizzle(sum, 0x081f); - if(SUBWAVE_SIZE > 1) sum += __hip_ds_swizzle(sum, 0x041f); - // clang-format on + typedef union flt_b32 + { + float val; + uint32_t b32; + } flt_b32_t; + + flt_b32_t upper_sum; + flt_b32_t temp_sum; + temp_sum.val = sum; + + if(SUBWAVE_SIZE > 1) + { + upper_sum.b32 = __hip_ds_swizzle(temp_sum.b32, 0x80b1); + temp_sum.val += upper_sum.val; + } + + if(SUBWAVE_SIZE > 2) + { + upper_sum.b32 = __hip_ds_swizzle(temp_sum.b32, 0x804e); + temp_sum.val += upper_sum.val; + } + + if(SUBWAVE_SIZE > 4) + { + upper_sum.b32 = __hip_ds_swizzle(temp_sum.b32, 0x101f); + temp_sum.val += upper_sum.val; + } + + if(SUBWAVE_SIZE > 8) + { + upper_sum.b32 = __hip_ds_swizzle(temp_sum.b32, 0x201f); + temp_sum.val += upper_sum.val; + } + + if(SUBWAVE_SIZE > 16) + { + upper_sum.b32 = __hip_ds_swizzle(temp_sum.b32, 0x401f); + temp_sum.val += upper_sum.val; + } + if(SUBWAVE_SIZE > 32) + { + upper_sum.b32 = __llvm_amdgcn_readlane(temp_sum.b32, 32); + temp_sum.val += upper_sum.val; + } + + sum = temp_sum.val; return sum; } -// Swizzle-based reduction +// Swizzle-based double reduction template __device__ double reduction(double sum) { @@ -38,48 +76,47 @@ __device__ double reduction(double sum) dbl_b32_t upper_sum; dbl_b32_t temp_sum; - temp_sum.val = sum; - if(SUBWAVE_SIZE > 32) 
+ if(SUBWAVE_SIZE > 1) { - upper_sum.b32[0] = __llvm_amdgcn_readlane(temp_sum.b32[0], 32); - upper_sum.b32[1] = __llvm_amdgcn_readlane(temp_sum.b32[1], 32); + upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x80b1); + upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x80b1); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 16) + if(SUBWAVE_SIZE > 2) { - upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x401f); - upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x401f); + upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x804e); + upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x804e); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 8) + if(SUBWAVE_SIZE > 4) { - upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x201f); - upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x201f); + upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x101f); + upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x101f); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 4) + if(SUBWAVE_SIZE > 8) { - upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x101f); - upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x101f); + upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x201f); + upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x201f); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 2) + if(SUBWAVE_SIZE > 16) { - upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x081f); - upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x081f); + upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x401f); + upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x401f); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 1) + if(SUBWAVE_SIZE > 32) { - upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x041f); - upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x041f); + upper_sum.b32[0] = __llvm_amdgcn_readlane(temp_sum.b32[0], 32); + upper_sum.b32[1] = __llvm_amdgcn_readlane(temp_sum.b32[1], 32); temp_sum.val += 
upper_sum.val; } From 18a1411a293c5b1d315bcf56946e1efe77e14629 Mon Sep 17 00:00:00 2001 From: Nico <31079890+ntrost57@users.noreply.github.com> Date: Thu, 19 Jul 2018 11:14:50 +0200 Subject: [PATCH 165/304] updated readme.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 5fb8112a..2a298528 100644 --- a/README.md +++ b/README.md @@ -11,9 +11,8 @@ Optional: * Required for tests. * Use GTEST_ROOT to specify GTest location. * If [GTest][] is not found, it will be downloaded and built automatically. -* [Google Benchmark][] +* [Boost][] * Required for benchmarks. - * If [Google Benchmark][] is not found, it will be downloaded and built automatically. ## Quickstart rocSPARSE build and install @@ -21,14 +20,14 @@ Optional: All compiler specifications are determined automatically. The compilation process can be performed by ``` # Clone rocSPARSE using git -git clone https://github.com/ROCmSoftwarePlatform/rocSparse.git +git clone https://github.com/ROCmSoftwarePlatform/rocSPARSE.git # Go to rocSPARSE directory, create and go to the build directory cd rocSPARSE; mkdir build; cd build # Configure rocSPARSE # Build options: -# BUILD_CLIENTS_TESTS - build tests using [GTest][] (OFF) +# BUILD_CLIENTS_TESTS - build tests (OFF) # BUILD_CLIENTS_BENCHMARKS - build benchmarks (OFF) # BUILD_CLIENTS_SAMPLES - build examples (ON) # BUILD_VERBOSE - verbose output (OFF) @@ -46,7 +45,7 @@ make You can also build rocSPARSE using the *install.sh* script ``` # Clone rocSPARSE using git -git clone https://github.com/ROCmSoftwarePlatform/rocSparse.git +git clone https://github.com/ROCmSoftwarePlatform/rocSPARSE.git # Go to rocSPARSE directory cd rocSPARSE @@ -93,5 +92,6 @@ The [license file][] can be found in the main repository. 
[ROCm]: https://github.com/RadeonOpenCompute/ROCm [HIP]: https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/ [GTest]: https://github.com/google/googletest +[Boost]: https://www.boost.org/ [the issue tracker]: https://github.com/ROCmSoftwarePlatform/rocSparse/issues [license file]: ./LICENSE.md From bfeb3a0af28dc7957a5930b18b3000f21ac29eae Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 19 Jul 2018 12:27:33 +0200 Subject: [PATCH 166/304] updated install.sh script --- install.sh | 303 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 213 insertions(+), 90 deletions(-) diff --git a/install.sh b/install.sh index 94775f45..88501517 100755 --- a/install.sh +++ b/install.sh @@ -1,31 +1,7 @@ #!/usr/bin/env bash # Author: Kent Knox -# ################################################# -# Pre-requisites check -# ################################################# -# Exit code 0: alls well -# Exit code 1: problems with getopt -# Exit code 2: problems with supported platforms - -# check if getopt command is installed -type getopt > /dev/null -if [[ $? -ne 0 ]]; then - echo "This script uses getopt to parse arguments; try installing the util-linux package"; - exit 1 -fi - -# lsb-release file describes the system -if [[ ! 
-e "/etc/lsb-release" ]]; then - echo "This script depends on the /etc/lsb-release file" - exit 2 -fi -source /etc/lsb-release - -if [[ ${DISTRIB_ID} != Ubuntu ]]; then - echo "This script only validated with Ubuntu" - exit 2 -fi +#set -x #echo on # ################################################# # helper functions @@ -35,6 +11,7 @@ function display_help() echo "rocsparse build & installation helper script" echo "./install [-h|--help] " echo " [-h|--help] prints this help message" +# echo " [--prefix] Specify an alternate CMAKE_INSTALL_PREFIX for cmake" echo " [-i|--install] install after build" echo " [-d|--dependencies] install build dependencies" echo " [-c|--clients] build library clients too (combines with -i & -d)" @@ -42,6 +19,31 @@ function display_help() echo " [--cuda] build library for cuda backend" } +supported_distro( ) +{ + if [ -z ${ID+foo} ]; then + printf "supported_distro(): \$ID must be set\n" + exit 2 + fi + + case "${ID}" in + ubuntu|centos|rhel|fedora) + true + ;; + *) printf "This script is currently supported on Ubuntu, CentOS, RHEL and Fedora\n" + exit 2 + ;; + esac +} + +# Exit the script with the status of the previous command if that command failed. +# The status is saved first because evaluating the (( )) test itself overwrites $? +check_exit_code( ) +{ + local status=$? + (( status == 0 )) || exit ${status} +} + # This function is helpful for dockerfiles that do not have sudo installed, but the default user is root elevate_if_not_root( ) { @@ -49,16 +51,147 @@ elevate_if_not_root( ) if (( ${uid} )); then sudo $@ + check_exit_code else $@ + check_exit_code + fi +} + +# Take an array of packages as input, and install those packages with 'apt' if they are not already installed +install_apt_packages( ) +{ + package_dependencies=("$@") + for package in "${package_dependencies[@]}"; do + if [[ $(dpkg-query --show --showformat='${db:Status-Abbrev}\n' ${package} 2> /dev/null | grep -q "ii"; echo $?) 
-ne 0 ]]; then + printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" + elevate_if_not_root apt install -y --no-install-recommends ${package} + fi + done +} + +# Take an array of packages as input, and install those packages with 'yum' if they are not already installed +install_yum_packages( ) +{ + package_dependencies=("$@") + for package in "${package_dependencies[@]}"; do + if [[ $(yum list installed ${package} &> /dev/null; echo $? ) -ne 0 ]]; then + printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" + elevate_if_not_root yum install -y ${package} + fi + done +} + +# Take an array of packages as input, and install those packages with 'dnf' if they are not already installed +install_dnf_packages( ) +{ + package_dependencies=("$@") + for package in "${package_dependencies[@]}"; do + if [[ $(dnf list installed ${package} &> /dev/null; echo $? ) -ne 0 ]]; then + printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" + elevate_if_not_root dnf install -y ${package} + fi + done +} + +# Take an array of packages as input, and delegate the work to the appropriate distro installer +# prereq: ${ID} must be defined before calling +# prereq: ${build_clients} must be defined before calling +install_packages( ) +{ + if [ -z ${ID+foo} ]; then + printf "install_packages(): \$ID must be set\n" + exit 2 + fi + + if [ -z ${build_clients+foo} ]; then + printf "install_packages(): \$build_clients must be set\n" + exit 2 + fi + + # dependencies needed for rocsparse and clients to build + local library_dependencies_ubuntu=( "make" "cmake-curses-gui" "python2.7" "python-yaml" "hip_hcc" "pkg-config" ) + local library_dependencies_centos=( "epel-release" "make" "cmake3" "python34" "PyYAML" "hip_hcc" "gcc-c++" ) + local library_dependencies_fedora=( "make" "cmake" "python34" "PyYAML" "hip_hcc" "gcc-c++" "libcxx-devel" "rpm-build" ) + + if [[ "${build_cuda}" == true ]]; then 
+ # Ideally, this could be cuda-cusparse-dev, but the package name has a version number in it + library_dependencies_ubuntu+=( "cuda" ) + library_dependencies_centos+=( "" ) # how to install cuda on centos? + library_dependencies_fedora+=( "" ) # how to install cuda on fedora? fi + + local client_dependencies_ubuntu=( "gfortran" "libboost-program-options-dev" ) + local client_dependencies_centos=( "gcc-gfortran" "boost-devel" ) + local client_dependencies_fedora=( "gcc-gfortran" "boost-devel" ) + + case "${ID}" in + ubuntu) + elevate_if_not_root apt update + install_apt_packages "${library_dependencies_ubuntu[@]}" + + if [[ "${build_clients}" == true ]]; then + install_apt_packages "${client_dependencies_ubuntu[@]}" + fi + ;; + + centos|rhel) +# yum -y update brings *all* installed packages up to date +# without seeking user approval +# elevate_if_not_root yum -y update + install_yum_packages "${library_dependencies_centos[@]}" + + if [[ "${build_clients}" == true ]]; then + install_yum_packages "${client_dependencies_centos[@]}" + fi + ;; + + fedora) +# elevate_if_not_root dnf -y update + install_dnf_packages "${library_dependencies_fedora[@]}" + + if [[ "${build_clients}" == true ]]; then + install_dnf_packages "${client_dependencies_fedora[@]}" + fi + ;; + *) + echo "This script is currently supported on Ubuntu, CentOS, RHEL and Fedora" + exit 2 + ;; + esac } +# ################################################# +# Pre-requisites check +# ################################################# +# Exit code 0: alls well +# Exit code 1: problems with getopt +# Exit code 2: problems with supported platforms + +# check if getopt command is installed +type getopt > /dev/null +if [[ $? 
-ne 0 ]]; then + echo "This script uses getopt to parse arguments; try installing the util-linux package"; + exit 1 +fi + +# os-release file describes the system +if [[ -e "/etc/os-release" ]]; then + source /etc/os-release +else + echo "This script depends on the /etc/os-release file" + exit 2 +fi + +# The following function exits script if an unsupported distro is detected +supported_distro + # ################################################# # global variables # ################################################# install_package=false install_dependencies=false +install_prefix=rocsparse-install build_clients=false build_cuda=false build_release=true @@ -104,6 +237,9 @@ while true; do --cuda) build_cuda=true shift ;; + --prefix) + install_prefix=${2} + shift 2 ;; --) shift ; break ;; *) echo "Unexpected command line parameter received; aborting"; exit 1 @@ -124,53 +260,34 @@ else rm -rf ${build_dir}/debug fi +# Default cmake executable is called cmake +cmake_executable=cmake + +case "${ID}" in + centos|rhel) + cmake_executable=cmake3 + ;; +esac + # ################################################# -# install build dependencies on request +# dependencies # ################################################# if [[ "${install_dependencies}" == true ]]; then - # dependencies needed for rocsparse and clients to build - library_dependencies_ubuntu=( "make" "cmake-curses-gui" "hip_hcc" "pkg-config" ) - if [[ "${build_cuda}" == false ]]; then - library_dependencies_ubuntu+=( "hcc" ) - else - # Ideally, this could be cuda-cusparse-dev, but the package name has a version number in it - library_dependencies_ubuntu+=( "cuda" ) - fi - - client_dependencies_ubuntu=( "libboost-program-options-dev" ) - - elevate_if_not_root apt update - - # Dependencies required by main library - for package in "${library_dependencies_ubuntu[@]}"; do - if [[ $(dpkg-query --show --showformat='${db:Status-Abbrev}\n' ${package} 2> /dev/null | grep -q "ii"; echo $?) 
-ne 0 ]]; then - printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" - elevate_if_not_root apt install -y --no-install-recommends ${package} - fi - done - # Dependencies required by library client apps - if [[ "${build_clients}" == true ]]; then - for package in "${client_dependencies_ubuntu[@]}"; do - if [[ $(dpkg-query --show --showformat='${db:Status-Abbrev}\n' ${package} 2> /dev/null | grep -q "ii"; echo $?) -ne 0 ]]; then - printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" - elevate_if_not_root apt install -y --no-install-recommends ${package} - fi - done - - # The following builds googletest from source - pushd . - printf "\033[32mBuilding \033[33mgoogletest\033[32m from source" - mkdir -p ${build_dir}/deps && cd ${build_dir}/deps - cmake -DBUILD_BOOST=OFF -DCMAKE_INSTALL_PREFIX=deps-install ../../deps - make -j$(nproc) - # elevate_if_not_root make install - make install - popd - fi + install_packages + # The following builds googletest from source + pushd . + printf "\033[32mBuilding \033[33mgoogletest\033[32m from source" + mkdir -p ${build_dir}/deps && cd ${build_dir}/deps + ${cmake_executable} -DBUILD_BOOST=OFF ../../deps + make -j$(nproc) + elevate_if_not_root make install + popd fi +# We append customary rocm path; if user provides custom rocm path in ${path}, our +# hard-coded path has lesser priority export PATH=${PATH}:/opt/rocm/bin pushd . @@ -191,34 +308,27 @@ pushd . 
# clients if [[ "${build_clients}" == true ]]; then - cmake_client_options="${cmake_client_options} -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCHMARKS=ON -DBUILD_CLIENTS_SELFTEST=ON -DBUILD_CLIENTS_RIDER=ON" + cmake_client_options="${cmake_client_options} -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCHMARKS=ON" fi - # On ROCm platforms, hcc compiler can build everything - if [[ "${build_cuda}" == false ]]; then - CXX=hcc cmake ${cmake_common_options} ${cmake_client_options} -DCMAKE_PREFIX_PATH="$(pwd)/../deps/deps-install" ../.. - make -j$(nproc) + compiler="hcc" + if [[ "${build_cuda}" == true ]]; then + compiler="hipcc" + fi + + # Uncomment for cmake debugging + # CXX=${compiler} ${cmake_executable} -Wdev --debug-output --trace ${cmake_common_options} -DCPACK_SET_DESTDIR=OFF -DCMAKE_INSTALL_PREFIX=rocsparse-install -DCPACK_PACKAGING_INSTALL_PREFIX=/opt/rocm ../.. + + # Build library with AMD toolchain because of existense of device kernels + if [[ "${build_clients}" == true ]]; then + CXX=${compiler} ${cmake_executable} ${cmake_common_options} ${cmake_client_options} -DCPACK_SET_DESTDIR=OFF -DCMAKE_INSTALL_PREFIX=rocsparse-install -DCPACK_PACKAGING_INSTALL_PREFIX=/opt/rocm ../.. else - # The nvidia compile is a little more complicated, in that we split compiling the library from the clients - # We use the hipcc compiler to build the rocsparse library for a cuda backend (hipcc offloads the compile to nvcc) - # However, we run into a compiler incompatibility compiling the clients between nvcc and sparsew3.h 3.3.4 headers. - # The incompatibility is fixed in sparse v3.3.6, but that is not shipped by default on Ubuntu - # As a workaround, since clients do not contain device code, we opt to build clients with the native - # compiler on the platform. The compiler cmake chooses during configuration time is mostly unchangeable, - # so we launch multiple cmake invocation with a different compiler on each. 
- - # Build library only with hipcc as compiler - CXX=hipcc cmake ${cmake_common_options} -DCMAKE_INSTALL_PREFIX=rocsparse-install -DCPACK_PACKAGE_INSTALL_DIRECTORY=/opt/rocm ../.. - make -j$(nproc) install - - # Build cuda clients with default host compiler - if [[ "${build_clients}" == true ]]; then - pushd clients - cmake ${cmake_common_options} ${cmake_client_options} -DCMAKE_PREFIX_PATH="$(pwd)/../rocsparse-install;$(pwd)/../deps/deps-install" ../../../clients - make -j$(nproc) - popd - fi + CXX=${compiler} ${cmake_executable} ${cmake_common_options} -DCPACK_SET_DESTDIR=OFF -DCMAKE_INSTALL_PREFIX=rocsparse-install -DCPACK_PACKAGING_INSTALL_PREFIX=/opt/rocm ../.. fi + check_exit_code + + make -j$(nproc) install + check_exit_code # ################################################# # install @@ -226,6 +336,19 @@ pushd . # installing through package manager, which makes uninstalling easy if [[ "${install_package}" == true ]]; then make package - elevate_if_not_root dpkg -i rocsparse-*.deb + check_exit_code + + case "${ID}" in + ubuntu) + elevate_if_not_root dpkg -i rocsparse-*.deb + ;; + centos|rhel) + elevate_if_not_root yum localinstall rocsparse-*.rpm + ;; + fedora) + elevate_if_not_root dnf install rocsparse-*.rpm + ;; + esac + fi popd From 83e16a7f6c55ced3608a72633b8389ee50229fcf Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 20 Jul 2018 08:28:50 +0200 Subject: [PATCH 167/304] initial csrmv_adaptive commit --- library/include/rocsparse-auxiliary.h | 16 + library/include/rocsparse-functions.h | 95 +++++ library/include/rocsparse-types.h | 1 + library/src/CMakeLists.txt | 1 + library/src/include/handle.h | 15 + library/src/level2/csrmv_adaptive_device.h | 367 ++++++++++++++++++ .../src/level2/rocsparse_csrmv_adaptive.cpp | 305 +++++++++++++++ .../src/level2/rocsparse_csrmv_adaptive.hpp | 242 ++++++++++++ library/src/rocsparse_auxiliary.cpp | 75 ++++ 9 files changed, 1117 insertions(+) create mode 100644 library/src/level2/csrmv_adaptive_device.h create 
mode 100644 library/src/level2/rocsparse_csrmv_adaptive.cpp create mode 100644 library/src/level2/rocsparse_csrmv_adaptive.hpp diff --git a/library/include/rocsparse-auxiliary.h b/library/include/rocsparse-auxiliary.h index 31f2f879..d5d12743 100644 --- a/library/include/rocsparse-auxiliary.h +++ b/library/include/rocsparse-auxiliary.h @@ -127,6 +127,22 @@ rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat* hyb); ROCSPARSE_EXPORT rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb); +/******************************************************************************** + * \brief rocsparse_csrmv_info is a structure holding the rocsparse + * csrmv info data gathered during csrmv_analysis. It must be initialized using + * rocsparse_create_csrmv_info() and the returned info structure must be passed + * to all subsequent csrmv adaptive function calls. It should be destroyed at + * the end using rocsparse_destroy_csrmv_info(). + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_create_csrmv_info(rocsparse_csrmv_info* info); + +/******************************************************************************** + * \brief Destroy csrmv info. + *******************************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_destroy_csrmv_info(rocsparse_csrmv_info info); + #ifdef __cplusplus } #endif diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 97bd365a..d2a58c78 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -528,6 +528,47 @@ rocsparse_status rocsparse_zcoomv(rocsparse_handle handle, rocsparse_double_complex* y); */ +/*! \brief SPARSE Level 2 API + + \details + csrmv_analysis performs the analysis for csrmv adaptive algorithm. 
+ It is expected that this function will be executed only once for a + given matrix and particular operation type. + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + trans operation type of A. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + nnz number of non-zero entries of A. + @param[in] + descr descriptor of A. + @param[in] + csr_row_ptr array of m+1 elements that point to the start + of every row of A. + @param[in] + csr_col_ind array of nnz elements containing the column indices of A. + @param[out] + info structure that holds the information collected during + the analysis phase. + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_csrmv_info info); + /*! 
\brief SPARSE Level 2 API \details @@ -630,6 +671,60 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, rocsparse_double_complex* y); */ + + + + + + + + + + + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsrmv_adaptive(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const float* alpha, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const float* x, + const float* beta, + float* y, + const rocsparse_csrmv_info info); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsrmv_adaptive(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const double* alpha, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const double* x, + const double* beta, + double* y, + const rocsparse_csrmv_info info); + + + + + + + + + + + + /*! 
\brief SPARSE Level 2 API \details diff --git a/library/include/rocsparse-types.h b/library/include/rocsparse-types.h index 85cf5169..420dc0f3 100644 --- a/library/include/rocsparse-types.h +++ b/library/include/rocsparse-types.h @@ -23,6 +23,7 @@ typedef int32_t rocsparse_int; typedef struct _rocsparse_handle* rocsparse_handle; typedef struct _rocsparse_mat_descr* rocsparse_mat_descr; typedef struct _rocsparse_hyb_mat* rocsparse_hyb_mat; +typedef struct _rocsparse_csrmv_info* rocsparse_csrmv_info; #ifdef __cplusplus extern "C" { diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index e6f77889..44783335 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -20,6 +20,7 @@ set(rocsparse_source # Level2 src/level2/rocsparse_coomv.cpp src/level2/rocsparse_csrmv.cpp + src/level2/rocsparse_csrmv_adaptive.cpp src/level2/rocsparse_ellmv.cpp src/level2/rocsparse_hybmv.cpp diff --git a/library/src/include/handle.h b/library/src/include/handle.h index bba65ce7..6f28fc77 100644 --- a/library/src/include/handle.h +++ b/library/src/include/handle.h @@ -100,6 +100,21 @@ struct _rocsparse_hyb_mat void* coo_val = nullptr; }; +/******************************************************************************** + * \brief rocsparse_create_csrmv_info is a structure holding the rocsparse + * csrmv info data gathered during csrmv_analysis. It must be initialized using + * rocsparse_create_csrmv_info() and the retured info structure must be passed + * to all subsequent csrmv adaptive function calls. It should be destroyed at + * the end using rocsparse_destroy_csrmv_info(). 
+ *******************************************************************************/ + struct _rocsparse_csrmv_info + { + // num row blocks + size_t size = 0; + // row blocks + unsigned long long* row_blocks = nullptr; + }; + /******************************************************************************** * \brief ELL format indexing *******************************************************************************/ diff --git a/library/src/level2/csrmv_adaptive_device.h b/library/src/level2/csrmv_adaptive_device.h new file mode 100644 index 00000000..2abf254e --- /dev/null +++ b/library/src/level2/csrmv_adaptive_device.h @@ -0,0 +1,367 @@ +#pragma once +#ifndef CSRMV_ADAPTIVE_DEVICE_H +#define CSRMV_ADAPTIVE_DEVICE_H + +#include + +static inline __device__ float atomic_add_float_extended(float* ptr, float temp, float* old_sum) +{ + return atomicAdd(ptr, temp); +} +// Double-precision atomic add emulated with a 64-bit atomicCAS loop: returns the new sum and, when old_sum is non-null, stores the value that was replaced. +static inline __device__ double atomic_add_float_extended(double* ptr, double temp, double* old_sum) +{ + unsigned long long newVal; + unsigned long long prevVal; + do + { + prevVal = __double_as_longlong(*ptr); + newVal = __double_as_longlong(temp + __longlong_as_double(prevVal)); + } while(atomicCAS((unsigned long long*)ptr, prevVal, newVal) != prevVal); + if(old_sum != 0) + *old_sum = __longlong_as_double(prevVal); + return __longlong_as_double(newVal); +} + +template +static inline __device__ T +sum2_reduce(T cur_sum, T* partial, int lid, int max_size, int reduc_size) +{ + if(max_size > reduc_size) + { + cur_sum += partial[lid + reduc_size]; + __syncthreads(); + partial[lid] = cur_sum; + } + return cur_sum; +} + +template +__device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, + T alpha, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* csr_val, + const T* x, + T beta, + T* y, + rocsparse_index_base idx_base) +{ + __shared__ T partialSums[BLOCKSIZE]; + unsigned int gid = hipBlockIdx_x; + unsigned int lid = hipThreadIdx_x; + + // The row blocks buffer holds a packed set of information used to inform each + 
// workgroup about how to do its work: + // + // |6666 5555 5555 5544 4444 4444 3333 3333|3322 2222|2222 1111 1111 1100 0000 0000| + // |3210 9876 5432 1098 7654 3210 9876 5432|1098 7654|3210 9876 5432 1098 7654 3210| + // |------------Row Information------------|--------^|---WG ID within a long row---| + // | | flag/|or # reduce threads for short| + // + // The upper 32 bits of each rowBlock entry tell the workgroup the ID of the first + // row it will be working on. When one workgroup calculates multiple rows, this + // rowBlock entry and the next one tell it the range of rows to work on. + // The lower 24 bits are used whenever multiple workgroups calculate a single long + // row. This tells each workgroup its ID within that row, so it knows which + // part of the row to operate on. + // Alternately, on short row blocks, the lower bits are used to communicate + // the number of threads that should be used for the reduction. Pre-calculating + // this on the CPU-side results in a noticable performance uplift on many matrices. + // Bit 24 is a flag bit used so that the multiple WGs calculating a long row can + // know when the first workgroup for that row has finished initializing the output + // value. While this bit is the same as the first workgroup's flag bit, this + // workgroup will spin-loop. + unsigned int row = ((row_blocks[gid] >> (64 - ROW_BITS)) & ((1ULL << ROW_BITS) - 1ULL)); + unsigned int stop_row = + ((row_blocks[gid + 1] >> (64 - ROW_BITS)) & ((1ULL << ROW_BITS) - 1ULL)); + unsigned int num_rows = stop_row - row; + + // Get the workgroup within this long row ID out of the bottom bits of the row block. + unsigned int wg = row_blocks[gid] & ((1 << WG_BITS) - 1); + + // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. + // If there are more items in this row, we assign more workgroups. 
+// unsigned int vecStart = hc::__mad24(wg, (unsigned int)(BLOCK_MULTIPLIER * BLOCKSIZE), (unsigned int)csr_row_ptr[row]); + unsigned int vecStart = ((wg >> 8) << 8) * (((unsigned int)(BLOCK_MULTIPLIER * BLOCKSIZE) >> 8) << 8) + csr_row_ptr[row]; + unsigned int vecEnd = (csr_row_ptr[row + 1] > vecStart + BLOCK_MULTIPLIER * BLOCKSIZE) + ? vecStart + BLOCK_MULTIPLIER * BLOCKSIZE + : csr_row_ptr[row + 1]; + + T temp_sum = 0.; + + // If the next row block starts more than 2 rows away, then we choose CSR-Stream. + // If this is zero (long rows) or one (final workgroup in a long row, or a single + // row in a row block), we want to use the CSR-Vector algorithm(s). + // We have found, through experimentation, that CSR-Vector is generally faster + // when working on 2 rows, due to its simplicity and better reduction method. + if(num_rows > ROWS_FOR_VECTOR) + { + // CSR-Stream case. See Sections III.A and III.B in the SC'14 paper: + // Efficient Sparse Matrix-Vector Multiplication on GPUs using the CSR Storage Format + // for a detailed description of CSR-Stream. + // In a nutshell, the idea is to use all of the threads to stream the matrix + // values into the local memory in a fast, coalesced manner. After that, the + // per-row reductions are done out of the local memory, which is designed + // to handle non-coalsced accesses. + + // The best method for reducing the local memory values depends on the number + // of rows. The SC'14 paper discusses a CSR-Scalar style reduction where + // each thread reduces its own row. This yields good performance if there + // are many (relatively short) rows. However, if they are few (relatively + // long) rows, it's actually better to perform a tree-style reduction where + // multiple threads team up to reduce the same row. + + // The calculation below tells you how many threads this workgroup can allocate + // to each row, assuming that every row gets the same number of threads. 
+ // We want the closest lower (or equal) power-of-2 to this number -- + // that is how many threads can work in each row's reduction using our algorithm. + // For instance, with workgroup size 256, 2 rows = 128 threads, 3 rows = 64 + // threads, 4 rows = 64 threads, 5 rows = 32 threads, etc. + // int numThreadsForRed = get_local_size(0) >> ((CHAR_BIT*sizeof(unsigned + // int))-clz(num_rows-1)); + unsigned int numThreadsForRed = wg; // Same calculation as above, done on host. + + // Stream all of this row block's matrix values into local memory. + // Perform the matvec in parallel with this work. + unsigned int col = csr_row_ptr[row] + lid; + if(gid != (gridDim.x - 1)) + { + for(int i = 0; i < BLOCKSIZE; i += WG_SIZE) + partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i]]; + } + else + { + // This is required so that we stay in bounds for csr_val[] and csr_col_ind[]. + // Otherwise, if the matrix's endpoints don't line up with BLOCKSIZE, + // we will buffer overflow. On today's dGPUs, this doesn't cause problems. + // The values are within a dGPU's page, which is zeroed out on allocation. + // However, this may change in the future (e.g. with shared virtual memory.) + // This causes a minor performance loss because this is the last workgroup + // to be launched, and this loop can't be unrolled. + for(int i = 0; col + i < csr_row_ptr[stop_row]; i += WG_SIZE) + partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i]]; + } + __syncthreads(); + + if(numThreadsForRed > 1) + { + // In this case, we want to have the workgroup perform a tree-style reduction + // of each row. {numThreadsForRed} adjacent threads team up to linearly reduce + // a row into {numThreadsForRed} locations in local memory. + // After that, the entire workgroup does a parallel reduction, and each + // row ends up with an individual answer. + + // {numThreadsForRed} adjacent threads all work on the same row, so their + // start and end values are the same. 
+ // numThreadsForRed guaranteed to be a power of two, so the clz code below + // avoids an integer divide. ~2% perf gain in EXTRA_PRECISION. + // size_t st = lid/numThreadsForRed; + unsigned int local_row = row + (lid >> (31 - __clz(numThreadsForRed))); + unsigned int local_first_val = csr_row_ptr[local_row] - csr_row_ptr[row]; + unsigned int local_last_val = csr_row_ptr[local_row + 1] - csr_row_ptr[row]; + unsigned int threadInBlock = lid & (numThreadsForRed - 1); + + // Not all row blocks are full -- they may have an odd number of rows. As such, + // we need to ensure that adjacent-groups only work on real data for this rowBlock. + if(local_row < stop_row) + { + // This is dangerous -- will infinite loop if your last value is within + // numThreadsForRed of MAX_UINT. Noticable performance gain to avoid a + // long induction variable here, though. + for(unsigned int local_cur_val = local_first_val + threadInBlock; + local_cur_val < local_last_val; + local_cur_val += numThreadsForRed) + temp_sum += partialSums[local_cur_val]; + } + __syncthreads(); + + partialSums[lid] = temp_sum; + + // Step one of this two-stage reduction is done. Now each row has {numThreadsForRed} + // values sitting in the local memory. This means that, roughly, the beginning of + // LDS is full up to {workgroup size} entries. + // Now we perform a parallel reduction that sums together the answers for each + // row in parallel, leaving us an answer in 'temp_sum' for each row. + for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + __syncthreads(); + temp_sum = sum2_reduce(temp_sum, partialSums, lid, numThreadsForRed, i); + } + + if(threadInBlock == 0 && local_row < stop_row) + { + // All of our write-outs check to see if the output vector should first be zeroed. + // If so, just do a write rather than a read-write. Measured to be a slight (~5%) + // performance improvement. + if(beta != 0.) 
+ temp_sum += beta * y[local_row]; + y[local_row] = temp_sum; + } + } + else + { + // In this case, we want to have each thread perform the reduction for a single row. + // Essentially, this looks like performing CSR-Scalar, except it is computed out of + // local memory. + // However, this reduction is also much faster than CSR-Scalar, because local memory + // is designed for scatter-gather operations. + // We need a while loop because there may be more rows than threads in the WG. + unsigned int local_row = row + lid; + while(local_row < stop_row) + { + int local_first_val = (csr_row_ptr[local_row] - csr_row_ptr[row]); + int local_last_val = csr_row_ptr[local_row + 1] - csr_row_ptr[row]; + temp_sum = 0.; + for(int local_cur_val = local_first_val; local_cur_val < local_last_val; + local_cur_val++) + temp_sum += partialSums[local_cur_val]; + + // After you've done the reduction into the temp_sum register, + // put that into the output for each row. + if(beta != 0.) + temp_sum += beta * y[local_row]; + y[local_row] = temp_sum; + local_row += WG_SIZE; + } + } + } + else if(num_rows >= 1 && !wg) // CSR-Vector case. + { + // ^^ The above check says that if this workgroup is supposed to work on <= ROWS_VECTOR + // number of rows then we should do the CSR-Vector algorithm. If we want this row to be + // done with CSR-LongRows, then all of its workgroups (except the last one) will have the + // same stop_row and row. The final workgroup in a LongRow will have stop_row and row + // different, but the internal wg number will be non-zero. + + // If this workgroup is operating on multiple rows (because CSR-Stream is poor for small + // numbers of rows), then it needs to iterate until it reaches the stop_row. + // We don't check <= stop_row because of the potential for unsigned overflow. + while(row < stop_row) + { + // Any workgroup only calculates, at most, BLOCKSIZE items in this row. + // If there are more items in this row, we use CSR-LongRows. 
+ temp_sum = 0.; + vecStart = csr_row_ptr[row]; + vecEnd = csr_row_ptr[row + 1]; + + // Load in a bunch of partial results into your register space, rather than LDS (no + // contention) + // Then dump the partially reduced answers into the LDS for inter-work-item reduction. + // Using a long induction variable to make sure unsigned int overflow doesn't break + // things. + for(unsigned long long j = vecStart + lid; j < vecEnd; j += WG_SIZE) + { + unsigned int col = csr_col_ind[(unsigned int)j]; + temp_sum += alpha * csr_val[(unsigned int)j] * x[col]; + } + + partialSums[lid] = temp_sum; + + // Reduce partial sums + for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + __syncthreads(); + temp_sum = sum2_reduce(temp_sum, partialSums, lid, WG_SIZE, i); + } + + if(lid == 0U) + { + if(beta != 0.) + temp_sum += beta * y[row]; + y[row] = temp_sum; + } + row++; + } + } + else + { + // In CSR-LongRows, we have more than one workgroup calculating this row. + // The output values for those types of rows are stored using atomic_add, because + // more than one parallel workgroup's value makes up the final answer. + // Unfortunately, this makes it difficult to do y=Ax, rather than y=Ax+y, because + // the values still left in y will be added in using the atomic_add. + // + // Our solution is to have the first workgroup in one of these long-rows cases + // properly initaizlie the output vector. All the other workgroups working on this + // row will spin-loop until that workgroup finishes its work. + + // First, figure out which workgroup you are in the row. Bottom 24 bits. + // You can use that to find the global ID for the first workgroup calculating + // this long row. + unsigned int first_wg_in_row = gid - (row_blocks[gid] & ((1ULL << WG_BITS) - 1ULL)); + unsigned int compare_value = row_blocks[gid] & (1ULL << WG_BITS); + + // Bit 24 in the first workgroup is the flag that everyone waits on. 
+ if(gid == first_wg_in_row && lid == 0ULL) + { + // The first workgroup handles the output initialization. + T out_val = y[row]; + temp_sum = (beta - 1.) * out_val; + atomicXor(&row_blocks[first_wg_in_row], (1ULL << WG_BITS)); // Release other workgroups. + } + // For every other workgroup, bit 24 holds the value they wait on. + // If your bit 24 == first_wg's bit 24, you spin loop. + // The first workgroup will eventually flip this bit, and you can move forward. + __syncthreads(); + while( + gid != first_wg_in_row && lid == 0U && + ((atomicMax(&row_blocks[first_wg_in_row], 0ULL) & (1ULL << WG_BITS)) == compare_value)) + ; + __syncthreads(); + + // After you've passed the barrier, update your local flag to make sure that + // the next time through, you know what to wait on. + if(gid != first_wg_in_row && lid == 0ULL) + row_blocks[gid] ^= (1ULL << WG_BITS); + + // All but the final workgroup in a long-row collaboration have the same start_row + // and stop_row. They only run for one iteration. + // Load in a bunch of partial results into your register space, rather than LDS (no + // contention) + // Then dump the partially reduced answers into the LDS for inter-work-item reduction. + unsigned int col = vecStart + lid; + if(row == stop_row) // inner thread, we can hardcode/unroll this loop + { + // Don't put BLOCK_MULTIPLIER*BLOCKSIZE as the stop point, because + // some GPU compilers will *aggressively* unroll this loop. + // That increases register pressure and reduces occupancy. + for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) + { + temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j]]; +#if 2 * WG_SIZE <= BLOCK_MULTIPLIER * BLOCKSIZE + // If you can, unroll this loop once. It somewhat helps performance. 
+ j += WG_SIZE; + temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j]]; +#endif + } + } + else + { + for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) + temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j]]; + } + + partialSums[lid] = temp_sum; + + // Reduce partial sums + for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + __syncthreads(); + temp_sum = sum2_reduce(temp_sum, partialSums, lid, WG_SIZE, i); + } + + if(lid == 0U) + { + atomic_add_float_extended(&y[row], temp_sum, 0); + } + } +} + +#endif // CSRMV_ADAPTIVE_DEVICE_H diff --git a/library/src/level2/rocsparse_csrmv_adaptive.cpp b/library/src/level2/rocsparse_csrmv_adaptive.cpp new file mode 100644 index 00000000..dadc54e3 --- /dev/null +++ b/library/src/level2/rocsparse_csrmv_adaptive.cpp @@ -0,0 +1,305 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "definitions.h" +#include "rocsparse.h" +#include "rocsparse_csrmv_adaptive.hpp" + +__attribute__((unused)) +static unsigned int flp2(unsigned int x) +{ + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + return x - (x >> 1); +} + +// Short rows in CSR-Adaptive are batched together into a single row block. +// If there are a relatively small number of these, then we choose to do +// a horizontal reduction (groups of threads all reduce the same row). +// If there are many threads (e.g. more threads than the maximum size +// of our workgroup) then we choose to have each thread serially reduce +// the row. +// This function calculates the number of threads that could team up +// to reduce these groups of rows. For instance, if you have a +// workgroup size of 256 and 4 rows, you could have 64 threads +// working on each row. If you have 5 rows, only 32 threads could +// reliably work on each row because our reduction assumes power-of-2. 
+static unsigned long long numThreadsForReduction(unsigned long long num_rows) +{ +#if defined(__INTEL_COMPILER) + return WG_SIZE >> (_bit_scan_reverse(num_rows - 1) + 1); +#elif(defined(__HIP_PLATFORM_NVCC__)) + return flp2(WG_SIZE / num_rows); +#elif(defined(__clang__) && __has_builtin(__builtin_clz)) || \ + !defined(__clang) && defined(__GNUG__) && \ + ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 30202) + return (WG_SIZE >> (8 * sizeof(int) - __builtin_clz(num_rows - 1))); +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) + unsigned long long bit_returned; + _BitScanReverse(&bit_returned, (num_rows - 1)); + return WG_SIZE >> (bit_returned + 1); +#else + return flp2(WG_SIZE / num_rows); +#endif +} + +static void ComputeRowBlocks(unsigned long long* rowBlocks, + size_t& rowBlockSize, + const int* rowDelimiters, + int nRows, + bool allocate_row_blocks = true) +{ + unsigned long long* rowBlocksBase; + int total_row_blocks = 1; // Start at one because of rowBlock[0] + + if(allocate_row_blocks) + { + rowBlocksBase = rowBlocks; + *rowBlocks = 0; + rowBlocks++; + } + unsigned long long sum = 0; + unsigned long long i, last_i = 0; + + // Check to ensure nRows can fit in 32 bits + if((unsigned long long)nRows > (unsigned long long)std::pow(2, ROW_BITS)) + { + fprintf(stderr, "nrow does not fit in 32 bits\n"); + exit(1); + } + + int consecutive_long_rows = 0; + for(i = 1; i <= (unsigned long long)nRows; i++) + { + int row_length = (rowDelimiters[i] - rowDelimiters[i - 1]); + sum += row_length; + + // The following section of code calculates whether you're moving between + // a series of "short" rows and a series of "long" rows. + // This is because the reduction in CSR-Adaptive likes things to be + // roughly the same length. Long rows can be reduced horizontally. + // Short rows can be reduced one-thread-per-row. Try not to mix them. 
+ if(row_length > 128) + consecutive_long_rows++; + else if(consecutive_long_rows > 0) + { + // If it turns out we WERE in a long-row region, cut if off now. + if(row_length < 32) // Now we're in a short-row region + consecutive_long_rows = -1; + else + consecutive_long_rows++; + } + + // If you just entered into a "long" row from a series of short rows, + // then we need to make sure we cut off those short rows. Put them in + // their own workgroup. + if(consecutive_long_rows == 1) + { + // Assuming there *was* a previous workgroup. If not, nothing to do here. + if(i - last_i > 1) + { + if(allocate_row_blocks) + { + *rowBlocks = ((i - 1) << (64 - ROW_BITS)); + // If this row fits into CSR-Stream, calculate how many rows + // can be used to do a parallel reduction. + // Fill in the low-order bits with the numThreadsForRed + if(((i - 1) - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + *(rowBlocks - 1) |= numThreadsForReduction((i - 1) - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i - 1; + sum = row_length; + } + } + else if(consecutive_long_rows == -1) + { + // We see the first short row after some long ones that + // didn't previously fill up a row block. + if(allocate_row_blocks) + { + *rowBlocks = ((i - 1) << (64 - ROW_BITS)); + if(((i - 1) - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + *(rowBlocks - 1) |= numThreadsForReduction((i - 1) - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i - 1; + sum = row_length; + consecutive_long_rows = 0; + } + + // Now, what's up with this row? What did it do? 
+ + // exactly one row results in non-zero elements to be greater than blockSize + // This is csr-vector case; bottom WGBITS == workgroup ID + if((i - last_i == 1) && sum > (unsigned long long)BLOCKSIZE) + { + int numWGReq = + static_cast(std::ceil((double)row_length / (BLOCK_MULTIPLIER * BLOCKSIZE))); + + // Check to ensure #workgroups can fit in WGBITS bits, if not + // then the last workgroup will do all the remaining work + numWGReq = (numWGReq < (int)std::pow(2, WG_BITS)) ? numWGReq : (int)std::pow(2, WG_BITS); + + if(allocate_row_blocks) + { + for(int w = 1; w < numWGReq; w++) + { + *rowBlocks = ((i - 1) << (64 - ROW_BITS)); + *rowBlocks |= static_cast(w); + rowBlocks++; + } + *rowBlocks = (i << (64 - ROW_BITS)); + rowBlocks++; + } + total_row_blocks += numWGReq; + last_i = i; + sum = 0; + consecutive_long_rows = 0; + } + // more than one row results in non-zero elements to be greater than blockSize + // This is csr-stream case; bottom WGBITS = number of parallel reduction threads + else if((i - last_i > 1) && sum > (unsigned long long)BLOCKSIZE) + { + i--; // This row won't fit, so back off one. + if(allocate_row_blocks) + { + *rowBlocks = (i << (64 - ROW_BITS)); + if((i - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + *(rowBlocks - 1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i; + sum = 0; + consecutive_long_rows = 0; + } + // This is csr-stream case; bottom WGBITS = number of parallel reduction threads + else if(sum == (unsigned long long)BLOCKSIZE) + { + if(allocate_row_blocks) + { + *rowBlocks = (i << (64 - ROW_BITS)); + if((i - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + *(rowBlocks - 1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i; + sum = 0; + consecutive_long_rows = 0; + } + } + + // If we didn't fill a row block with the last row, make sure we don't lose it. 
+ if(allocate_row_blocks && (*(rowBlocks - 1) >> (64 - ROW_BITS)) != static_cast(nRows)) + { + *rowBlocks = (static_cast(nRows) << (64 - ROW_BITS)); + if((nRows - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + *(rowBlocks - 1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; + + if(allocate_row_blocks) + { + size_t dist = std::distance(rowBlocksBase, rowBlocks); + assert((2 * dist) <= rowBlockSize); + // Update the size of rowBlocks to reflect the actual amount of memory used + // We're multiplying the size by two because the extended precision form of + // CSR-Adaptive requires more space for the final global reduction. + rowBlockSize = 2 * dist; + } + else + rowBlockSize = 2 * total_row_blocks; +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_csrmv_info info) +{ + // row blocks size + info->size = 0; + + // Temporary arrays to hold device data + std::vector hptr(m + 1); + RETURN_IF_HIP_ERROR(hipMemcpy(hptr.data(), csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToHost)); + + // Determine row blocks array size + ComputeRowBlocks((unsigned long long*)NULL, info->size, hptr.data(), m, false); + + // Create row blocks structure + std::vector row_blocks(info->size, 0); + + ComputeRowBlocks(row_blocks.data(), + info->size, + hptr.data(), + m, + true); + +printf("Required buffer size: %lu kByte\n", info->size * sizeof(unsigned long long) >> 10); + + // Allocate memory on device to hold csrmv info + RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->row_blocks, sizeof(unsigned long long) * info->size)); + + // 
Copy row blocks information to device + RETURN_IF_HIP_ERROR(hipMemcpy(info->row_blocks, row_blocks.data(), sizeof(unsigned long long) * info->size, hipMemcpyHostToDevice)); + + return rocsparse_status_success; +} + +extern "C" rocsparse_status rocsparse_scsrmv_adaptive(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const float* alpha, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const float* x, + const float* beta, + float* y, + const rocsparse_csrmv_info info) +{ + return rocsparse_csrmv_adaptive_template( + handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); +} + +extern "C" rocsparse_status rocsparse_dcsrmv_adaptive(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const double* alpha, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const double* x, + const double* beta, + double* y, + const rocsparse_csrmv_info info) +{ + return rocsparse_csrmv_adaptive_template( + handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); +} diff --git a/library/src/level2/rocsparse_csrmv_adaptive.hpp b/library/src/level2/rocsparse_csrmv_adaptive.hpp new file mode 100644 index 00000000..a7f81da4 --- /dev/null +++ b/library/src/level2/rocsparse_csrmv_adaptive.hpp @@ -0,0 +1,242 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_CSRMV_ADAPTIVE_HPP +#define ROCSPARSE_CSRMV_ADAPTIVE_HPP + +#include "rocsparse.h" +#include "handle.h" +#include "utility.h" +#include "csrmv_adaptive_device.h" + +#include + +#define BLOCKSIZE 1024 +#define BLOCK_MULTIPLIER 3 +#define ROWS_FOR_VECTOR 1 +#define WG_BITS 24 +#define ROW_BITS 32 +#define WG_SIZE 256 + +template +__launch_bounds__(WG_SIZE) +__global__ void csrmvn_adaptive_kernel_host_pointer(unsigned long long* __restrict__ row_blocks, + T alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + T beta, + T* __restrict__ y, + rocsparse_index_base idx_base) +{ + csrmvn_adaptive_device( + row_blocks, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y, idx_base); +} + +template +__launch_bounds__(WG_SIZE) +__global__ void csrmvn_adaptive_kernel_device_pointer(unsigned long long* __restrict__ row_blocks, + const T* alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + const T* beta, + T* __restrict__ y, + rocsparse_index_base idx_base) +{ + csrmvn_adaptive_device( + row_blocks, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y, idx_base); +} + +template +rocsparse_status rocsparse_csrmv_adaptive_template(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const T* alpha, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* x, + const T* beta, + T* y, + const rocsparse_csrmv_info info) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return 
rocsparse_status_invalid_pointer; + } + else if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + if(handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_Xcsrmv_adaptive"), + trans, + m, + n, + nnz, + *alpha, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)x, + *beta, + (const void*&)y, + (const void*&)info); + } + else + { + log_trace(handle, + replaceX("rocsparse_Xcsrmv_adaptive"), + trans, + m, + n, + nnz, + (const void*&)alpha, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)x, + (const void*&)beta, + (const void*&)y, + (const void*&)info); + } + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(n < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(alpha == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(beta == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + 
hipStream_t stream = handle->stream; + + // Run different csrmv kernels + if(trans == rocsparse_operation_none) + { + dim3 csrmvn_adaptive_blocks((info->size / 2) - 1); + dim3 csrmvn_adaptive_threads(WG_SIZE); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + hipLaunchKernelGGL((csrmvn_adaptive_kernel_device_pointer), + csrmvn_adaptive_blocks, + csrmvn_adaptive_threads, + 0, + stream, + info->row_blocks, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y, + descr->base); + } + else + { + if(*alpha == 0.0 && *beta == 1.0) + { + return rocsparse_status_success; + } + + hipLaunchKernelGGL((csrmvn_adaptive_kernel_host_pointer), + csrmvn_adaptive_blocks, + csrmvn_adaptive_threads, + 0, + stream, + info->row_blocks, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + } + else + { + //TODO + return rocsparse_status_not_implemented; + } + + return rocsparse_status_success; +} + +#endif // ROCSPARSE_CSRMV_ADAPTIVE_HPP diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index 27f90b15..9c6afe37 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -283,6 +283,30 @@ rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb) // Destruct try { + // Clean up ELL part + if(hyb->ell_col_ind != nullptr) + { + hipFree(hyb->ell_col_ind); + } + if(hyb->ell_val != nullptr) + { + hipFree(hyb->ell_val); + } + + // Clean up COO part + if(hyb->coo_row_ind != nullptr) + { + hipFree(hyb->coo_row_ind); + } + if(hyb->coo_col_ind != nullptr) + { + hipFree(hyb->coo_col_ind); + } + if(hyb->coo_val != nullptr) + { + hipFree(hyb->coo_val); + } + delete hyb; } catch(const rocsparse_status& status) @@ -292,6 +316,57 @@ rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb) return rocsparse_status_success; } +/******************************************************************************** + * \brief rocsparse_create_csrmv_info is a 
structure holding the rocsparse + * csrmv info data gathered during csrmv_analysis. It must be initialized using + * rocsparse_create_csrmv_info() and the retured info structure must be passed + * to all subsequent csrmv adaptive function calls. It should be destroyed at + * the end using rocsparse_destroy_csrmv_info(). + *******************************************************************************/ +rocsparse_status rocsparse_create_csrmv_info(rocsparse_csrmv_info* info) +{ + if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else + { + // Allocate + try + { + *info = new _rocsparse_csrmv_info; + } + catch(const rocsparse_status& status) + { + return status; + } + return rocsparse_status_success; + } +} + +/******************************************************************************** + * \brief Destroy csrmv info. + *******************************************************************************/ +rocsparse_status rocsparse_destroy_csrmv_info(rocsparse_csrmv_info info) +{ + // Destruct + try + { + // Clean up row blocks + if(info->row_blocks != nullptr) + { + hipFree(info->row_blocks); + } + + delete info; + } + catch(const rocsparse_status& status) + { + return status; + } + return rocsparse_status_success; +} + #ifdef __cplusplus } #endif From 3410ff40bb74409380998d50d8f618261f1144bf Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 20 Jul 2018 15:06:49 +0200 Subject: [PATCH 168/304] removed unused stuff from cmake dependency file --- cmake/Dependencies.cmake | 66 ---------------------------------------- 1 file changed, 66 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 1c3f8563..bee698d7 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -38,72 +38,6 @@ elseif(HIP_PLATFORM STREQUAL "nvcc") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -arch sm_35") endif() -# Test dependencies -if(BUILD_TEST) - if(NOT DEPENDENCIES_FORCE_DOWNLOAD) - find_package(GTest QUIET) - endif() - if(NOT 
GTEST_FOUND) - message(STATUS "GTest not found. Downloading and building GTest.") - set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gtest CACHE PATH "") - download_project(PROJ googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG master - INSTALL_DIR ${GTEST_ROOT} - CMAKE_ARGS -DBUILD_GTEST=ON -DINSTALL_GTEST=ON -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX= - LOG_DOWNLOAD TRUE - LOG_CONFIGURE TRUE - LOG_BUILD TRUE - LOG_INSTALL TRUE - BUILD_PROJECT TRUE - UPDATE_DISCONNECTED TRUE - ) - endif() - find_package(GTest REQUIRED) - # Download some test matrices - set(TEST_MATRICES - nos1 - nos2 - nos3 - nos4 - nos5 - nos6 - nos7 - ) - foreach(m ${TEST_MATRICES}) - if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/matrices/${m}.mtx") - file(DOWNLOAD ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/lanpro/${m}.mtx.gz - ${CMAKE_CURRENT_BINARY_DIR}/matrices/${m}.mtx.gz) - execute_process(COMMAND gzip -d -f ${m}.mtx.gz - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/matrices) - endif() - endforeach() -endif() - -# Benchmark dependencies -if(BUILD_BENCHMARK) - if(NOT DEPENDENCIES_FORCE_DOWNLOAD) - find_package(benchmark QUIET) - endif() - if(NOT benchmark_FOUND) - message(STATUS "Google Benchmark not found. 
Downloading and building Google Benchmark.") - set(GOOGLEBENCHMARK_ROOT ${CMAKE_CURRENT_BINARY_DIR}/googlebenchmark CACHE PATH "") - download_project(PROJ googlebenchmark - GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG master - INSTALL_DIR ${GOOGLEBENCHMARK_ROOT} - CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBENCHMARK_ENABLE_TESTING=OFF -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX= - LOG_DOWNLOAD TRUE - LOG_CONFIGURE TRUE - LOG_BUILD TRUE - LOG_INSTALL TRUE - BUILD_PROJECT TRUE - UPDATE_DISCONNECTED TRUE - ) - endif() - find_package(benchmark REQUIRED CONFIG PATHS ${GOOGLEBENCHMARK_ROOT}) -endif() - # rocPRIM package if(HIP_PLATFORM STREQUAL "hcc") find_package(ROCPRIM QUIET CONFIG PATHS /opt/rocm) From 5440f33764aabd3a1e2d71d58e15ffdb4aa15b50 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 20 Jul 2018 15:07:40 +0200 Subject: [PATCH 169/304] updated matrix market reader and fixed few issues (also a bug related to index_base) --- clients/include/testing_coo2csr.hpp | 2 +- clients/include/testing_coomv.hpp | 2 +- clients/include/testing_coosort.hpp | 2 +- clients/include/testing_csr2csc.hpp | 2 +- clients/include/testing_csr2ell.hpp | 2 +- clients/include/testing_csr2hyb.hpp | 2 +- clients/include/testing_csrsort.hpp | 2 +- clients/include/testing_ellmv.hpp | 2 +- clients/include/testing_hybmv.hpp | 2 +- clients/include/utility.hpp | 41 +++++++++++++++++++++++------ 10 files changed, 42 insertions(+), 17 deletions(-) diff --git a/clients/include/testing_coo2csr.hpp b/clients/include/testing_coo2csr.hpp index a160c7ac..9ebe21b2 100644 --- a/clients/include/testing_coo2csr.hpp +++ b/clients/include/testing_coo2csr.hpp @@ -145,7 +145,7 @@ rocsparse_status testing_coo2csr(Arguments argus) if(argus.filename != "") { if(read_mtx_matrix( - argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val) != 0) + argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val, idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", 
argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/clients/include/testing_coomv.hpp b/clients/include/testing_coomv.hpp index 5c1b96f6..c2bd8b17 100644 --- a/clients/include/testing_coomv.hpp +++ b/clients/include/testing_coomv.hpp @@ -225,7 +225,7 @@ rocsparse_status testing_coomv(Arguments argus) { if(argus.filename != "") { - if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hrow, hcol, hval) != 0) + if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hrow, hcol, hval, idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/clients/include/testing_coosort.hpp b/clients/include/testing_coosort.hpp index 93dfdf1b..eec8e342 100644 --- a/clients/include/testing_coosort.hpp +++ b/clients/include/testing_coosort.hpp @@ -280,7 +280,7 @@ rocsparse_status testing_coosort(Arguments argus) if(argus.filename != "") { if(read_mtx_matrix( - argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val) != 0) + argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val, idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/clients/include/testing_csr2csc.hpp b/clients/include/testing_csr2csc.hpp index aea17b66..dcd5095a 100644 --- a/clients/include/testing_csr2csc.hpp +++ b/clients/include/testing_csr2csc.hpp @@ -384,7 +384,7 @@ rocsparse_status testing_csr2csc(Arguments argus) if(argus.filename != "") { if(read_mtx_matrix( - argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val) != 0) + argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/clients/include/testing_csr2ell.hpp b/clients/include/testing_csr2ell.hpp index 26657072..7d17af3b 100644 --- 
a/clients/include/testing_csr2ell.hpp +++ b/clients/include/testing_csr2ell.hpp @@ -390,7 +390,7 @@ rocsparse_status testing_csr2ell(Arguments argus) if(argus.filename != "") { if(read_mtx_matrix( - argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val) != 0) + argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, csr_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/clients/include/testing_csr2hyb.hpp b/clients/include/testing_csr2hyb.hpp index 4874826c..7588f561 100644 --- a/clients/include/testing_csr2hyb.hpp +++ b/clients/include/testing_csr2hyb.hpp @@ -221,7 +221,7 @@ rocsparse_status testing_csr2hyb(Arguments argus) if(argus.filename != "") { if(read_mtx_matrix( - argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val) != 0) + argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/clients/include/testing_csrsort.hpp b/clients/include/testing_csrsort.hpp index 6236fb7b..b2068ed7 100644 --- a/clients/include/testing_csrsort.hpp +++ b/clients/include/testing_csrsort.hpp @@ -241,7 +241,7 @@ rocsparse_status testing_csrsort(Arguments argus) if(argus.filename != "") { if(read_mtx_matrix( - argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val) != 0) + argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/clients/include/testing_ellmv.hpp b/clients/include/testing_ellmv.hpp index e09c375d..ef256529 100644 --- a/clients/include/testing_ellmv.hpp +++ b/clients/include/testing_ellmv.hpp @@ -205,7 +205,7 @@ rocsparse_status testing_ellmv(Arguments argus) { if(argus.filename != "") { - 
if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval) != + if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); diff --git a/clients/include/testing_hybmv.hpp b/clients/include/testing_hybmv.hpp index 1c5f67ad..47680a26 100644 --- a/clients/include/testing_hybmv.hpp +++ b/clients/include/testing_hybmv.hpp @@ -205,7 +205,7 @@ rocsparse_status testing_hybmv(Arguments argus) { if(argus.filename != "") { - if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval) != + if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index cb90375c..09a941a3 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -331,7 +332,8 @@ rocsparse_int read_mtx_matrix(const char* filename, rocsparse_int& nnz, std::vector& row, std::vector& col, - std::vector& val) + std::vector& val, + rocsparse_index_base idx_base) { printf("Reading matrix %s...", filename); fflush(stdout); @@ -391,7 +393,7 @@ rocsparse_int read_mtx_matrix(const char* filename, } // Check data - if(strcmp(data, "real") != 0) + if(strcmp(data, "real") != 0 && strcmp(data, "integer") != 0 && strcmp(data, "pattern") != 0) { return -1; } @@ -428,26 +430,49 @@ rocsparse_int read_mtx_matrix(const char* filename, rocsparse_int idx = 0; while(fgets(line, 1024, f)) { + if(idx >= nnz) + { + return true; + } + rocsparse_int irow; rocsparse_int icol; - double dval; + T ival; - sscanf(line, "%d %d %lf", &irow, &icol, &dval); + std::istringstream ss(line); - --irow; - --icol; + if(strcmp(data, "pattern")) + { + ss >> irow >> icol; + ival = static_cast(1); + } + else + { + ss >> irow >> icol 
>> ival; + } + + if(idx_base == rocsparse_index_base_zero) + { + --irow; + --icol; + } unsorted_row[idx] = irow; unsorted_col[idx] = icol; - unsorted_val[idx] = (T)dval; + unsorted_val[idx] = ival; ++idx; if(symm && irow != icol) { + if(idx >= nnz) + { + return true; + } + unsorted_row[idx] = icol; unsorted_col[idx] = irow; - unsorted_val[idx] = (T)dval; + unsorted_val[idx] = ival; ++idx; } } From a0e370439aab3ece5585709f7d407c7109148ecd Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 20 Jul 2018 15:08:22 +0200 Subject: [PATCH 170/304] updated csrmv tests to use real matrices (downloaded during cmake process) --- clients/include/testing_csrmv.hpp | 89 +++++++++++++++++++++++++------ clients/tests/CMakeLists.txt | 37 +++++++++++++ clients/tests/test_csrmv.cpp | 44 +++++++++++++-- 3 files changed, 150 insertions(+), 20 deletions(-) diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index c51e1f9d..c204fece 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -12,6 +12,7 @@ #include "unit.hpp" #include +#include #include using namespace rocsparse; @@ -139,8 +140,17 @@ rocsparse_status testing_csrmv(Arguments argus) T h_beta = argus.beta; rocsparse_operation trans = argus.trans; rocsparse_index_base idx_base = argus.idx_base; + std::string filename = ""; rocsparse_status status; + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if((m == -99 && n == -99) || argus.timing == 1) + { + filename = argus.filename; + m = n = safe_size; + } + std::unique_ptr test_handle(new handle_struct); rocsparse_handle handle = test_handle->handle; @@ -213,12 +223,12 @@ rocsparse_status testing_csrmv(Arguments argus) } else { - if(argus.filename != "") + if(filename != "") { - if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval) != + if(read_mtx_matrix(filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base) != 
0) { - fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; } } @@ -314,28 +324,75 @@ rocsparse_status testing_csrmv(Arguments argus) CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * m, hipMemcpyDeviceToHost)); - // CPU + // CPU - do the csrmv row reduction in the same order as the GPU double cpu_time_used = get_time_us(); - for(rocsparse_int i = 0; i < m; ++i) + // Query for warpSize + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, 0); + + rocsparse_int WF_SIZE; + rocsparse_int nnz_per_row = nnz / m; + + if(prop.warpSize == 32) { - hy_gold[i] *= h_beta; - for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; - ++j) + if(nnz_per_row < 4) WF_SIZE = 2; + else if(nnz_per_row < 8) WF_SIZE = 4; + else if(nnz_per_row < 16) WF_SIZE = 8; + else if(nnz_per_row < 32) WF_SIZE = 16; + else WF_SIZE = 32; + } + else if(prop.warpSize == 64) + { + if(nnz_per_row < 4) WF_SIZE = 2; + else if(nnz_per_row < 8) WF_SIZE = 4; + else if(nnz_per_row < 16) WF_SIZE = 8; + else if(nnz_per_row < 32) WF_SIZE = 16; + else if(nnz_per_row < 64) WF_SIZE = 32; + else WF_SIZE = 64; + } + else + { + return rocsparse_status_internal_error; + } + + for(rocsparse_int i=0; i sum(WF_SIZE, 0.0); + + for(rocsparse_int j=hcsr_row_ptr[i]-idx_base; j #include +#include #include +#include typedef rocsparse_index_base base; -typedef std::tuple csrmv_tuple; +typedef std::tuple csrmv_tuple; -int csr_M_range[] = {-1, 0, 10, 500, 7111, 10000}; -int csr_N_range[] = {-3, 0, 33, 842, 4441, 10000}; +int csr_M_range[] = {-99, -1, 0, 500, 7111}; +int csr_N_range[] = {-99, 0, 842, 4441}; std::vector csr_alpha_range = {2.0, 3.0}; std::vector csr_beta_range = {0.0, 1.0}; base csr_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +std::string csr_mtx[] = 
{"rma10.mtx", + "mac_econ_fwd500.mtx", + "mc2depi.mtx", + "scircuit.mtx", + "ASIC_320k.mtx", + "bmwcra_1.mtx", + "nos1.mtx", + "nos2.mtx", + "nos3.mtx", + "nos4.mtx", + "nos5.mtx", + "nos6.mtx", + "nos7.mtx"}; + class parameterized_csrmv : public testing::TestWithParam { protected: @@ -38,6 +54,25 @@ Arguments setup_csrmv_arguments(csrmv_tuple tup) arg.beta = std::get<3>(tup); arg.idx_base = std::get<4>(tup); arg.timing = 0; + + // Determine absolute path of test matrix + std::string mtx_file = std::get<5>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len-14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + mtx_file; + return arg; } @@ -65,4 +100,5 @@ INSTANTIATE_TEST_CASE_P(csrmv, testing::ValuesIn(csr_N_range), testing::ValuesIn(csr_alpha_range), testing::ValuesIn(csr_beta_range), - testing::ValuesIn(csr_idxbase_range))); + testing::ValuesIn(csr_idxbase_range), + testing::ValuesIn(csr_mtx))); From 26e3db0752833f0362691f04f0c0edf9e3ed8981 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 20 Jul 2018 15:10:02 +0200 Subject: [PATCH 171/304] clang-format --- clients/include/testing_coo2csr.hpp | 10 ++++- clients/include/testing_coosort.hpp | 10 ++++- clients/include/testing_csr2csc.hpp | 10 ++++- clients/include/testing_csr2ell.hpp | 10 ++++- clients/include/testing_csr2hyb.hpp | 10 ++++- clients/include/testing_csrmv.hpp | 61 +++++++++++++++++------------ clients/include/testing_csrsort.hpp | 10 ++++- clients/include/testing_ellmv.hpp | 4 +- clients/include/testing_hybmv.hpp | 4 +- clients/tests/test_csrmv.cpp | 2 +- 10 files changed, 90 insertions(+), 41 deletions(-) diff --git a/clients/include/testing_coo2csr.hpp b/clients/include/testing_coo2csr.hpp index 9ebe21b2..bc6e531d 100644 --- 
a/clients/include/testing_coo2csr.hpp +++ b/clients/include/testing_coo2csr.hpp @@ -144,8 +144,14 @@ rocsparse_status testing_coo2csr(Arguments argus) { if(argus.filename != "") { - if(read_mtx_matrix( - argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val, idx_base) != 0) + if(read_mtx_matrix(argus.filename.c_str(), + m, + n, + nnz, + hcoo_row_ind, + hcoo_col_ind, + hcoo_val, + idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/clients/include/testing_coosort.hpp b/clients/include/testing_coosort.hpp index eec8e342..6f6f5004 100644 --- a/clients/include/testing_coosort.hpp +++ b/clients/include/testing_coosort.hpp @@ -279,8 +279,14 @@ rocsparse_status testing_coosort(Arguments argus) { if(argus.filename != "") { - if(read_mtx_matrix( - argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val, idx_base) != 0) + if(read_mtx_matrix(argus.filename.c_str(), + m, + n, + nnz, + hcoo_row_ind, + hcoo_col_ind, + hcoo_val, + idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/clients/include/testing_csr2csc.hpp b/clients/include/testing_csr2csc.hpp index dcd5095a..5a776c61 100644 --- a/clients/include/testing_csr2csc.hpp +++ b/clients/include/testing_csr2csc.hpp @@ -383,8 +383,14 @@ rocsparse_status testing_csr2csc(Arguments argus) if(argus.filename != "") { - if(read_mtx_matrix( - argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base) != 0) + if(read_mtx_matrix(argus.filename.c_str(), + m, + n, + nnz, + hcoo_row_ind, + hcsr_col_ind, + hcsr_val, + idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/clients/include/testing_csr2ell.hpp b/clients/include/testing_csr2ell.hpp index 7d17af3b..eb524970 100644 --- a/clients/include/testing_csr2ell.hpp +++ 
b/clients/include/testing_csr2ell.hpp @@ -389,8 +389,14 @@ rocsparse_status testing_csr2ell(Arguments argus) { if(argus.filename != "") { - if(read_mtx_matrix( - argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, csr_base) != 0) + if(read_mtx_matrix(argus.filename.c_str(), + m, + n, + nnz, + hcoo_row_ind, + hcsr_col_ind, + hcsr_val, + csr_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/clients/include/testing_csr2hyb.hpp b/clients/include/testing_csr2hyb.hpp index 7588f561..a717b5c0 100644 --- a/clients/include/testing_csr2hyb.hpp +++ b/clients/include/testing_csr2hyb.hpp @@ -220,8 +220,14 @@ rocsparse_status testing_csr2hyb(Arguments argus) { if(argus.filename != "") { - if(read_mtx_matrix( - argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base) != 0) + if(read_mtx_matrix(argus.filename.c_str(), + m, + n, + nnz, + hcoo_row_ind, + hcsr_col_ind, + hcsr_val, + idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); return rocsparse_status_internal_error; diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index c204fece..31154dcb 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -225,8 +225,8 @@ rocsparse_status testing_csrmv(Arguments argus) { if(filename != "") { - if(read_mtx_matrix(filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base) != - 0) + if(read_mtx_matrix( + filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; @@ -336,46 +336,59 @@ rocsparse_status testing_csrmv(Arguments argus) if(prop.warpSize == 32) { - if(nnz_per_row < 4) WF_SIZE = 2; - else if(nnz_per_row < 8) WF_SIZE = 4; - else if(nnz_per_row < 16) WF_SIZE = 8; - else if(nnz_per_row < 32) WF_SIZE = 16; - else WF_SIZE = 32; + 
if(nnz_per_row < 4) + WF_SIZE = 2; + else if(nnz_per_row < 8) + WF_SIZE = 4; + else if(nnz_per_row < 16) + WF_SIZE = 8; + else if(nnz_per_row < 32) + WF_SIZE = 16; + else + WF_SIZE = 32; } else if(prop.warpSize == 64) { - if(nnz_per_row < 4) WF_SIZE = 2; - else if(nnz_per_row < 8) WF_SIZE = 4; - else if(nnz_per_row < 16) WF_SIZE = 8; - else if(nnz_per_row < 32) WF_SIZE = 16; - else if(nnz_per_row < 64) WF_SIZE = 32; - else WF_SIZE = 64; + if(nnz_per_row < 4) + WF_SIZE = 2; + else if(nnz_per_row < 8) + WF_SIZE = 4; + else if(nnz_per_row < 16) + WF_SIZE = 8; + else if(nnz_per_row < 32) + WF_SIZE = 16; + else if(nnz_per_row < 64) + WF_SIZE = 32; + else + WF_SIZE = 64; } else { return rocsparse_status_internal_error; } - - for(rocsparse_int i=0; i sum(WF_SIZE, 0.0); - - for(rocsparse_int j=hcsr_row_ptr[i]-idx_base; j Date: Fri, 20 Jul 2018 15:48:33 +0200 Subject: [PATCH 172/304] install script reverted for now --- install.sh | 301 ++++++++++++++++------------------------------------- 1 file changed, 89 insertions(+), 212 deletions(-) diff --git a/install.sh b/install.sh index 88501517..afc68ebd 100755 --- a/install.sh +++ b/install.sh @@ -1,7 +1,31 @@ #!/usr/bin/env bash # Author: Kent Knox -#set -x #echo on +# ################################################# +# Pre-requisites check +# ################################################# +# Exit code 0: alls well +# Exit code 1: problems with getopt +# Exit code 2: problems with supported platforms + +# check if getopt command is installed +type getopt > /dev/null +if [[ $? -ne 0 ]]; then + echo "This script uses getopt to parse arguments; try installing the util-linux package"; + exit 1 +fi + +# lsb-release file describes the system +if [[ ! 
-e "/etc/lsb-release" ]]; then + echo "This script depends on the /etc/lsb-release file" + exit 2 +fi +source /etc/lsb-release + +if [[ ${DISTRIB_ID} != Ubuntu ]]; then + echo "This script only validated with Ubuntu" + exit 2 +fi # ################################################# # helper functions @@ -11,7 +35,6 @@ function display_help() echo "rocsparse build & installation helper script" echo "./install [-h|--help] " echo " [-h|--help] prints this help message" -# echo " [--prefix] Specify an alternate CMAKE_INSTALL_PREFIX for cmake" echo " [-i|--install] install after build" echo " [-d|--dependencies] install build dependencies" echo " [-c|--clients] build library clients too (combines with -i & -d)" @@ -19,31 +42,6 @@ function display_help() echo " [--cuda] build library for cuda backend" } -supported_distro( ) -{ - if [ -z ${ID+foo} ]; then - printf "supported_distro(): \$ID must be set\n" - exit 2 - fi - - case "${ID}" in - ubuntu|centos|rhel|fedora) - true - ;; - *) printf "This script is currently supported on Ubuntu, CentOS, RHEL and Fedora\n" - exit 2 - ;; - esac -} - -# This function is helpful for dockerfiles that do not have sudo installed, but the default user is root -check_exit_code( ) -{ - if (( $? != 0 )); then - exit $? - fi -} - # This function is helpful for dockerfiles that do not have sudo installed, but the default user is root elevate_if_not_root( ) { @@ -51,147 +49,16 @@ elevate_if_not_root( ) if (( ${uid} )); then sudo $@ - check_exit_code else $@ - check_exit_code - fi -} - -# Take an array of packages as input, and install those packages with 'apt' if they are not already installed -install_apt_packages( ) -{ - package_dependencies=("$@") - for package in "${package_dependencies[@]}"; do - if [[ $(dpkg-query --show --showformat='${db:Status-Abbrev}\n' ${package} 2> /dev/null | grep -q "ii"; echo $?) 
-ne 0 ]]; then - printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" - elevate_if_not_root apt install -y --no-install-recommends ${package} - fi - done -} - -# Take an array of packages as input, and install those packages with 'yum' if they are not already installed -install_yum_packages( ) -{ - package_dependencies=("$@") - for package in "${package_dependencies[@]}"; do - if [[ $(yum list installed ${package} &> /dev/null; echo $? ) -ne 0 ]]; then - printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" - elevate_if_not_root yum install -y ${package} - fi - done -} - -# Take an array of packages as input, and install those packages with 'dnf' if they are not already installed -install_dnf_packages( ) -{ - package_dependencies=("$@") - for package in "${package_dependencies[@]}"; do - if [[ $(dnf list installed ${package} &> /dev/null; echo $? ) -ne 0 ]]; then - printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" - elevate_if_not_root dnf install -y ${package} - fi - done -} - -# Take an array of packages as input, and delegate the work to the appropriate distro installer -# prereq: ${ID} must be defined before calling -# prereq: ${build_clients} must be defined before calling -install_packages( ) -{ - if [ -z ${ID+foo} ]; then - printf "install_packages(): \$ID must be set\n" - exit 2 - fi - - if [ -z ${build_clients+foo} ]; then - printf "install_packages(): \$build_clients must be set\n" - exit 2 - fi - - # dependencies needed for rocsparse and clients to build - local library_dependencies_ubuntu=( "make" "cmake-curses-gui" "python2.7" "python-yaml" "hip_hcc" "pkg-config" ) - local library_dependencies_centos=( "epel-release" "make" "cmake3" "python34" "PyYAML" "hip_hcc" "gcc-c++" ) - local library_dependencies_fedora=( "make" "cmake" "python34" "PyYAML" "hip_hcc" "gcc-c++" "libcxx-devel" "rpm-build" ) - - if [[ "${build_cuda}" == true ]]; then 
- # Ideally, this could be cuda-cusparse-dev, but the package name has a version number in it - library_dependencies_ubuntu+=( "cuda" ) - library_dependencies_centos+=( "" ) # how to install cuda on centos? - library_dependencies_fedora+=( "" ) # how to install cuda on fedora? fi - - local client_dependencies_ubuntu=( "gfortran" "libboost-program-options-dev" ) - local client_dependencies_centos=( "gcc-gfortran" "boost-devel" ) - local client_dependencies_fedora=( "gcc-gfortran" "boost-devel" ) - - case "${ID}" in - ubuntu) - elevate_if_not_root apt update - install_apt_packages "${library_dependencies_ubuntu[@]}" - - if [[ "${build_clients}" == true ]]; then - install_apt_packages "${client_dependencies_ubuntu[@]}" - fi - ;; - - centos|rhel) -# yum -y update brings *all* installed packages up to date -# without seeking user approval -# elevate_if_not_root yum -y update - install_yum_packages "${library_dependencies_centos[@]}" - - if [[ "${build_clients}" == true ]]; then - install_yum_packages "${client_dependencies_centos[@]}" - fi - ;; - - fedora) -# elevate_if_not_root dnf -y update - install_dnf_packages "${library_dependencies_fedora[@]}" - - if [[ "${build_clients}" == true ]]; then - install_dnf_packages "${client_dependencies_fedora[@]}" - fi - ;; - *) - echo "This script is currently supported on Ubuntu, CentOS, RHEL and Fedora" - exit 2 - ;; - esac } -# ################################################# -# Pre-requisites check -# ################################################# -# Exit code 0: alls well -# Exit code 1: problems with getopt -# Exit code 2: problems with supported platforms - -# check if getopt command is installed -type getopt > /dev/null -if [[ $? 
-ne 0 ]]; then - echo "This script uses getopt to parse arguments; try installing the util-linux package"; - exit 1 -fi - -# os-release file describes the system -if [[ -e "/etc/os-release" ]]; then - source /etc/os-release -else - echo "This script depends on the /etc/os-release file" - exit 2 -fi - -# The following function exits script if an unsupported distro is detected -supported_distro - # ################################################# # global variables # ################################################# install_package=false install_dependencies=false -install_prefix=rocsparse-install build_clients=false build_cuda=false build_release=true @@ -237,9 +104,6 @@ while true; do --cuda) build_cuda=true shift ;; - --prefix) - install_prefix=${2} - shift 2 ;; --) shift ; break ;; *) echo "Unexpected command line parameter received; aborting"; exit 1 @@ -260,34 +124,53 @@ else rm -rf ${build_dir}/debug fi -# Default cmake executable is called cmake -cmake_executable=cmake - -case "${ID}" in - centos|rhel) - cmake_executable=cmake3 - ;; -esac - # ################################################# -# dependencies +# install build dependencies on request # ################################################# if [[ "${install_dependencies}" == true ]]; then + # dependencies needed for rocsparse and clients to build + library_dependencies_ubuntu=( "make" "cmake-curses-gui" "hip_hcc" "pkg-config" ) + if [[ "${build_cuda}" == false ]]; then + library_dependencies_ubuntu+=( "hcc" ) + else + # Ideally, this could be cuda-cusparse-dev, but the package name has a version number in it + library_dependencies_ubuntu+=( "cuda" ) + fi - install_packages + client_dependencies_ubuntu=( "libboost-program-options-dev" ) + + elevate_if_not_root apt update + + # Dependencies required by main library + for package in "${library_dependencies_ubuntu[@]}"; do + if [[ $(dpkg-query --show --showformat='${db:Status-Abbrev}\n' ${package} 2> /dev/null | grep -q "ii"; echo $?) 
-ne 0 ]]; then + printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" + elevate_if_not_root apt install -y --no-install-recommends ${package} + fi + done + + # Dependencies required by library client apps + if [[ "${build_clients}" == true ]]; then + for package in "${client_dependencies_ubuntu[@]}"; do + if [[ $(dpkg-query --show --showformat='${db:Status-Abbrev}\n' ${package} 2> /dev/null | grep -q "ii"; echo $?) -ne 0 ]]; then + printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" + elevate_if_not_root apt install -y --no-install-recommends ${package} + fi + done + + # The following builds googletest from source + pushd . + printf "\033[32mBuilding \033[33mgoogletest\033[32m from source" + mkdir -p ${build_dir}/deps && cd ${build_dir}/deps + cmake -DBUILD_BOOST=OFF -DCMAKE_INSTALL_PREFIX=deps-install ../../deps + make -j$(nproc) + # elevate_if_not_root make install + make install + popd + fi - # The following builds googletest from source - pushd . - printf "\033[32mBuilding \033[33mgoogletest\033[32m from source" - mkdir -p ${build_dir}/deps && cd ${build_dir}/deps - ${cmake_executable} -DBUILD_BOOST=OFF ../../deps - make -j$(nproc) - elevate_if_not_root make install - popd fi -# We append customary rocm path; if user provides custom rocm path in ${path}, our -# hard-coded path has lesser priority export PATH=${PATH}:/opt/rocm/bin pushd . @@ -311,24 +194,31 @@ pushd . cmake_client_options="${cmake_client_options} -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCHMARKS=ON" fi - compiler="hcc" - if [[ "${build_cuda}" == true ]]; then - compiler="hipcc" - fi - - # Uncomment for cmake debugging - # CXX=${compiler} ${cmake_executable} -Wdev --debug-output --trace ${cmake_common_options} -DCPACK_SET_DESTDIR=OFF -DCMAKE_INSTALL_PREFIX=rocsparse-install -DCPACK_PACKAGING_INSTALL_PREFIX=/opt/rocm ../.. 
- - # Build library with AMD toolchain because of existense of device kernels - if [[ "${build_clients}" == true ]]; then - CXX=${compiler} ${cmake_executable} ${cmake_common_options} ${cmake_client_options} -DCPACK_SET_DESTDIR=OFF -DCMAKE_INSTALL_PREFIX=rocsparse-install -DCPACK_PACKAGING_INSTALL_PREFIX=/opt/rocm ../.. + # On ROCm platforms, hcc compiler can build everything + if [[ "${build_cuda}" == false ]]; then + CXX=hcc cmake ${cmake_common_options} ${cmake_client_options} -DCMAKE_PREFIX_PATH="$(pwd)/../deps/deps-install" ../.. + make -j$(nproc) else - CXX=${compiler} ${cmake_executable} ${cmake_common_options} -DCPACK_SET_DESTDIR=OFF -DCMAKE_INSTALL_PREFIX=rocsparse-install -DCPACK_PACKAGING_INSTALL_PREFIX=/opt/rocm ../.. + # The nvidia compile is a little more complicated, in that we split compiling the library from the clients + # We use the hipcc compiler to build the rocsparse library for a cuda backend (hipcc offloads the compile to nvcc) + # However, we run into a compiler incompatibility compiling the clients between nvcc and sparsew3.h 3.3.4 headers. + # The incompatibility is fixed in sparse v3.3.6, but that is not shipped by default on Ubuntu + # As a workaround, since clients do not contain device code, we opt to build clients with the native + # compiler on the platform. The compiler cmake chooses during configuration time is mostly unchangeable, + # so we launch multiple cmake invocation with a different compiler on each. + + # Build library only with hipcc as compiler + CXX=hipcc cmake ${cmake_common_options} -DCMAKE_INSTALL_PREFIX=rocsparse-install -DCPACK_PACKAGE_INSTALL_DIRECTORY=/opt/rocm ../.. 
+ make -j$(nproc) install + + # Build cuda clients with default host compiler + if [[ "${build_clients}" == true ]]; then + pushd clients + cmake ${cmake_common_options} ${cmake_client_options} -DCMAKE_PREFIX_PATH="$(pwd)/../rocsparse-install;$(pwd)/../deps/deps-install" ../../../clients + make -j$(nproc) + popd + fi fi - check_exit_code - - make -j$(nproc) install - check_exit_code # ################################################# # install @@ -336,19 +226,6 @@ pushd . # installing through package manager, which makes uninstalling easy if [[ "${install_package}" == true ]]; then make package - check_exit_code - - case "${ID}" in - ubuntu) - elevate_if_not_root dpkg -i rocsparse-*.deb - ;; - centos|rhel) - elevate_if_not_root yum localinstall rocsparse-*.rpm - ;; - fedora) - elevate_if_not_root dnf install rocsparse-*.rpm - ;; - esac - + elevate_if_not_root dpkg -i rocsparse-*.deb fi popd From ca4d4aa6d2ccf81bd328f0c9e884e68dd1076869 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 20 Jul 2018 17:06:32 +0200 Subject: [PATCH 173/304] mtx to binary conversion during cmake process to speed up testing and few fixes for csrmv test --- clients/include/testing_csrmv.hpp | 23 ++- clients/include/utility.hpp | 93 +++++------ clients/tests/CMakeLists.txt | 7 +- clients/tests/test_csrmv.cpp | 33 ++-- deps/convert | Bin 0 -> 23000 bytes deps/convert.cpp | 258 ++++++++++++++++++++++++++++++ 6 files changed, 339 insertions(+), 75 deletions(-) create mode 100755 deps/convert create mode 100644 deps/convert.cpp diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index 31154dcb..d125de25 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -140,17 +140,23 @@ rocsparse_status testing_csrmv(Arguments argus) T h_beta = argus.beta; rocsparse_operation trans = argus.trans; rocsparse_index_base idx_base = argus.idx_base; + std::string binfile = ""; std::string filename = ""; rocsparse_status status; // When in 
testing mode, M == N == -99 indicates that we are testing with a real // matrix from cise.ufl.edu - if((m == -99 && n == -99) || argus.timing == 1) + if(m == -99 && n == -99 && argus.timing == 0) { - filename = argus.filename; + binfile = argus.filename; m = n = safe_size; } + if(argus.timing == 1) + { + filename = argus.filename; + } + std::unique_ptr test_handle(new handle_struct); rocsparse_handle handle = test_handle->handle; @@ -216,7 +222,15 @@ rocsparse_status testing_csrmv(Arguments argus) // Initial Data on CPU srand(12345ULL); - if(argus.laplacian) + if(binfile != "") + { + if(read_bin_matrix(binfile.c_str(), m, n, nnz, hcsr_row_ptr, hcol_ind, hval, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + } + else if(argus.laplacian) { m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcol_ind, hval, idx_base); nnz = hcsr_row_ptr[m]; @@ -378,8 +392,7 @@ rocsparse_status testing_csrmv(Arguments argus) { if(j + k < hcsr_row_ptr[i + 1] - idx_base) { - sum[k] = - std::fma(h_alpha * hval[j + k], hx[hcol_ind[j + k] - idx_base], sum[k]); + sum[k] = fma(h_alpha * hval[j + k], hx[hcol_ind[j + k] - idx_base], sum[k]); } } } diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index 09a941a3..a3739cf0 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -441,7 +441,7 @@ rocsparse_int read_mtx_matrix(const char* filename, std::istringstream ss(line); - if(strcmp(data, "pattern")) + if(!strcmp(data, "pattern")) { ss >> irow >> icol; ival = static_cast(1); @@ -518,75 +518,64 @@ rocsparse_int read_mtx_matrix(const char* filename, } /* ============================================================================================ */ -/*! \brief Convert matrix from COO to CSR format */ +/*! 
\brief Read matrix from binary file in CSR format */ template -void coo_to_csr(rocsparse_int nrow, - rocsparse_int ncol, - rocsparse_int nnz, - const std::vector& src_row, - const std::vector& src_col, - const std::vector& src_val, - std::vector& dst_ptr, - std::vector& dst_col, - std::vector& dst_val) +rocsparse_int read_bin_matrix(const char* filename, + rocsparse_int& nrow, + rocsparse_int& ncol, + rocsparse_int& nnz, + std::vector& ptr, + std::vector& col, + std::vector& val, + rocsparse_index_base idx_base) { - dst_ptr.resize(nrow + 1, 0); - dst_col.resize(nnz); - dst_val.resize(nnz); + printf("Reading matrix %s...", filename); + fflush(stdout); - // Compute nnz entries per row - for(rocsparse_int i = 0; i < nnz; ++i) + FILE* f = fopen(filename, "rb"); + if(!f) { - ++dst_ptr[src_row[i]]; + return -1; } - rocsparse_int sum = 0; - for(rocsparse_int i = 0; i < nrow; ++i) - { - rocsparse_int tmp = dst_ptr[i]; - dst_ptr[i] = sum; - sum += tmp; - } - dst_ptr[nrow] = sum; + int err; - // Write column index and values - for(rocsparse_int i = 0; i < nnz; ++i) - { - rocsparse_int row = src_row[i]; - rocsparse_int idx = dst_ptr[row]; + err = fread(&nrow, sizeof(int), 1, f); + err |= fread(&ncol, sizeof(int), 1, f); + err |= fread(&nnz, sizeof(int), 1, f); - dst_col[idx] = src_col[i]; - dst_val[idx] = src_val[i]; + // Allocate memory + ptr.resize(nrow + 1); + col.resize(nnz); + val.resize(nnz); + std::vector tmp(nnz); - ++dst_ptr[row]; - } + err |= fread(ptr.data(), sizeof(int), nrow + 1, f); + err |= fread(col.data(), sizeof(int), nnz, f); + err |= fread(tmp.data(), sizeof(double), nnz, f); - rocsparse_int last = 0; - for(rocsparse_int i = 0; i < nrow + 1; ++i) + for(rocsparse_int i = 0; i < nnz; ++i) { - rocsparse_int tmp = dst_ptr[i]; - dst_ptr[i] = last; - last = tmp; + val[i] = static_cast(tmp[i]); } - for(rocsparse_int i = 0; i < nrow; ++i) + if(idx_base == rocsparse_index_base_one) { - for(rocsparse_int j = dst_ptr[i]; j < dst_ptr[i + 1]; ++j) + for(rocsparse_int i 
= 0; i < nrow + 1; ++i) { - for(rocsparse_int k = dst_ptr[i]; k < dst_ptr[i + 1] - 1; ++k) - { - // Swap elements - rocsparse_int idx = dst_col[k]; - T val = dst_val[k]; - - dst_col[k] = dst_col[k + 1]; - dst_val[k] = dst_val[k + 1]; + ++ptr[i]; + } - dst_col[k + 1] = idx; - dst_val[k + 1] = val; - } + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++col[i]; } } + + printf("done.\n"); + fflush(stdout); + + return 0; } #ifdef __cplusplus diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index 9bc60d61..113d1444 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -5,6 +5,7 @@ # Download some test matrices set(TEST_MATRICES Bova/rma10 + JGD_BIBD/bibd_22_8 Williams/mac_econ_fwd500 Williams/mc2depi Hamm/scircuit @@ -25,7 +26,7 @@ foreach(m ${TEST_MATRICES}) list(GET sep_m 1 mat) # Download test matrices if not already downloaded - if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/matrices/${mat}.mtx") + if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/matrices/${mat}.bin") message(" Downloading and extracting test matrix ${m}.tar.gz") file(DOWNLOAD http://www.cise.ufl.edu/research/sparse/MM/${m}.tar.gz ${CMAKE_CURRENT_BINARY_DIR}/matrices/${mat}.tar.gz) @@ -33,7 +34,9 @@ foreach(m ${TEST_MATRICES}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/matrices) execute_process(COMMAND mv ${mat}/${mat}.mtx . 
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/matrices) - execute_process(COMMAND rm ${mat}.tar.gz ${mat} -rf + execute_process(COMMAND ${CMAKE_SOURCE_DIR}/deps/convert ${mat}.mtx ${mat}.bin + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/matrices) + execute_process(COMMAND rm ${mat}.tar.gz ${mat} ${mat}.mtx -rf WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/matrices) endif() endforeach() diff --git a/clients/tests/test_csrmv.cpp b/clients/tests/test_csrmv.cpp index c5ea3a78..f02dc39c 100644 --- a/clients/tests/test_csrmv.cpp +++ b/clients/tests/test_csrmv.cpp @@ -22,19 +22,20 @@ std::vector csr_beta_range = {0.0, 1.0}; base csr_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; -std::string csr_mtx[] = {"rma10.mtx", - "mac_econ_fwd500.mtx", - "mc2depi.mtx", - "scircuit.mtx", - "ASIC_320k.mtx", - "bmwcra_1.mtx", - "nos1.mtx", - "nos2.mtx", - "nos3.mtx", - "nos4.mtx", - "nos5.mtx", - "nos6.mtx", - "nos7.mtx"}; +std::string csr_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "ASIC_320k.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; class parameterized_csrmv : public testing::TestWithParam { @@ -56,7 +57,7 @@ Arguments setup_csrmv_arguments(csrmv_tuple tup) arg.timing = 0; // Determine absolute path of test matrix - std::string mtx_file = std::get<5>(tup); + std::string bin_file = std::get<5>(tup); // Get current executables absolute path char path_exe[PATH_MAX]; @@ -71,7 +72,7 @@ Arguments setup_csrmv_arguments(csrmv_tuple tup) } // Matrices are stored at the same path in matrices directory - arg.filename = std::string(path_exe) + "matrices/" + mtx_file; + arg.filename = std::string(path_exe) + "matrices/" + bin_file; return arg; } @@ -101,4 +102,4 @@ INSTANTIATE_TEST_CASE_P(csrmv, testing::ValuesIn(csr_alpha_range), testing::ValuesIn(csr_beta_range), testing::ValuesIn(csr_idxbase_range), - 
testing::ValuesIn(csr_mtx))); + testing::ValuesIn(csr_bin))); diff --git a/deps/convert b/deps/convert new file mode 100755 index 0000000000000000000000000000000000000000..0d80c004d97acfd8e425f51a0ec8a3f172775563 GIT binary patch literal 23000 zcmeHve|%KcweOi9K*X3cXw(#`OxvNCQAraB$Vil#WCG{F8AwTj)L(}%nLsqZUUCBH z1sa_MJsl6?ZB^c@y|wMr_V(IO(N|lo3X?<$_EigbZBd_st+q2nA%bG0z`XC;=j_R3 z$n@TK@2BrCHe~i$Yp=cb+H0@9_TFpG>~B^ni|jU=z>{6LK_KXGzMrAYPeC~Nbp~3v z`N9Q)UvLXw6ea>Q0e^m0Lb>_G^AU?R&+{xg2XZI==G#rge7m1L@O+wy7I@Ajq@3Ct zO@8aq%-5EfAkPIzNr~!_ti#t6MEUA5`Cdzo!&c7g5lHp&ZeZldSF;D64_kuqocE9V zMt`*yzgoK=5W#-RgdqDI&54#$^#@u5J^Qa_AaXx@;Q1yKE%2Pz`#S28od40}vItjL z?d8&o%aWhxR(FN3H8-u9SN65$hFQ%`tz8>uZJa-E*1WRf&bH#&tlz|+XcsT5V=Zw# zx&e{Qh4>>Ipz_`G-uUzPR=oe#t{>f7d*P=q9@#P}@V|)uGW_vzqX##t5bkpP-6`Mk zOW*MrHxFROre1&-k3})40DQ*7Uz!6?{cw-NXL=6&7U0L@pUi<@mZROg9QZjo+FhOl zUz&r@q8#{RIpie$kJqoOa^QcIgU`Yo^e^O~-pMFmoq{UY zhSY{ghlQ>URW`S^Mr!NVG)K&`@k`XT8#k)$k&e!`*81iq?M`)liJ;!p&|JM9S&cTk zL{+tQ9c>%b=1A*WZJiqF=xFPxUK(blrPuP(HT4Z@eRFeLn5wmIXrT_Q4TsfElD||? 
z>smK7wKhQL&aRdS6^6}@HMX@!TG2i9EPRI=UU!GuSl`qvXl>1H8zLP-HE3zC zrM{_ESX^1D&M7Vt7AtZ^r8>K~v@HEN`ZT9_^m%siT%mH+sz8OTmKM)3iC3)xk1~v# z9X`YKc=Sa7C{aFveG=9#OcL_ov&}La47VRNd>&VS_09_gP^^YMP_lvFA>>qEjlb_a z_%PgPp0Ez(gez=nx`Wa!irg7^%VjhC%nZD#5tCC% z20nA$n4f{CHhA)9;L~#lg;E9{ib$vG419hXD+sGH@K9_ztw8`PKK+Lcc zOhcMH%-~aV0n<<>_c6EvaN~BsEY(T+2Z5C}iV@l==_eGUYP)pzhdWRt#kS;0hS3a` z4N}ZGV( z-)ZH~T6x^c_gVQtD?dzmVmH*<8&?CXv`tcMDeU6<>(QBCKYf*7*hM2;8c$3^9|OA} zK}w2k-Y4~@U@}F(0orLolTB#537tMeWiOKWCSr&oZcQ{ubEdIQ2lZjG_lMvv8Ij?e z1Gn8eAjRso$DSe0KtWV9_9AH^u^khmH{P{Miv6QYiG_BqRANjoF{Z^B(^!ltEY-Cy zaWQPBHy+UU-1H5xXE`y8Ep?$wx0j(yzscy542hC6O#G!a(UMH8~RZevhJ*N0C-2>npvSrmxVdr0L%=|{4@|29egqZECAxVlyzj0F%S9LEl*i7)biC)R$N&+4e~L&uW*Zxn0P)iPME!2h8@nuP>eB z7KE-TWY~#TDCO2Y6Rrfxc7nCETfhNCiY^qoCJcQW_0r5frV6Qjw-Bv)+y`1mGn1c{WDoVOFdD-4w#1^hvzLbruC8rL;A~0 zjPj|$*yX_mg->s4ko1FHUm992PVc5tIacPWh~;mq@a12*`3pf^0G$*~IF+z;dXca2 zTOob#He6wCJnV);*1`ttAI+%M^G zDc<9fzEA2Kv6J7tWVaKIDf&st`v=K(C{g#M!l2uJ#{s5-_eCK zl&#l#61w8~MBaZ^(Sx25#dlIPmSCcVVl!oZc&G^alk{Xz|D&QGRD8n~df~mBzb@-> z*{HTD#N{A&rb$D30v26DF$(*=n=cD?~`E;imhMTe@gQ8i#^{*-z4Ag zmZehc7SD)eRKtv42ALAOL<--64uD*??UVP9NPT~@hm2`c)AqLnVpYZFNMT!*uk0R4 z@82{_y8^QQJQnnPqT5FGY}r>R_NA&=`LrrudC%s9=)Y|EQ+nagQBmqU0u$XQ?T4QB zY>q1=L44@@);LQ>P_Rix;l06_J*e9xeSdNZ42|^-&HV~CF7|u@n4zuUI%g#ErV^42y-H+M}&Zm z5ve0%*bcr5hSVkN9}j&cJs#Rh#dl==#(;j;>7afc(FSAA?Q*o=DTkMw4jH$fR*W(x za7h0!=yTpD>%ZG{Seq{!Uq4OrPYO?l!=eI7|J|0SU@y+0A!t;IU4iKl!}OT588Zmj zH`u(9Vha|X5;8A@pp^v%xX zQIs%Q%6_%TS9ba4_o1-ZCeJi|v!Jg$&2^R>WQ{q{*iJO&5{>TwG4!iZdtp4v!K3_v z&5tCvq7oBPiWIpye4TSZKH20m&;*56e0Bjh2_?LWgkgnCbQ zy{p7DPo5n8&>5=&(FahW2liBYGNccK8g)P&(48Ks@06X$5>KM096dAXo3Fu5&CHlS zQtT2~)6*5c@^6X8QLHe@sjzlg{{#b=43p1ga>Bue^p|QB{Z*K`3)Ypts}KtgTu~lO z-I=)MB=po97khq0bKcnG$(QuLUt?5+UF6}g@FiX$5|rdg`kSLAiKi{Jb1eG1Kw}<& z&6|vaqW=ZnO9~%I+)Wk2wH~TqF+Y}QP9vev5-Q0C=@R7l%&l>Ad%e5BvoFYfB^ zjbmt>V(%H|bbrA#w!5B4>BMm239W<}#EC&8Ve?Gv)HJqr6f5@r3S4%t=NL(U_rw#s zw-89j1%PN3ZO`u)ds{$x;)#Ix$RH2_?_iaZZx{9N}7%m!w3U5fe1%z$iQ(FtZJ 
zUGLEX=_V&PQ!#>`^3wRO`@kP0*fAtZKvmV75_|su?y*ZSE6Tn5jKYclEi~Z+(qPb2 z4MAw7q32cX84s)JLF~B!ha!0w**nrg#dlo1_p8vRWK3iJnx@AxlEcU^LTjaPG=pT= z)wG7KrVn>w*>M;{c$$YKbW#YX>V)JVD8*_4s$i%wfQs{wcT|7hPX` z8mEcl;*?uFQlmI?0E6KttyD*wJwY2ri8eXm0m&%uMTf{g`t`!QX^+$nmmSoxmods- zrlR8kaf)+~QEBU%6$F=H&<9r9mjVl%knd+82qT^~qJhc2vn zoo#GnCq4h29Td8$6^xY=)L6qFWzH#md;(+4kBJx~>F$@v+hwc?q(7LUBpGQvJebzQ z4~Ll^P@GI*8MEZz?O=7%;2b|oEC87VU)*jUc&@R)YJg%7?f(;B04hlbG}<3PLFLJ- zN9CqPej}>`XDf%xV0u{p{a8D&eGbA1y?fan+P6o%cM$%d^CFr;I2k*ilJrWDVuzKk zl1V=--m?Yt>lwW(lfLL&dIxr|;yrWI^hKcmnDOsDDfUowB5j)sB@$&Xp^R&uA=T3a z|MVEy=MigC4J0N24qzK2+1}|rAa1(>#@2h_Iedcppt$@u(ZeD(fp+P$gVDnd-(GQX zU-a`y;sec?`1;=Hh>f~Rb9y@R=!ku=(n)p2o(T+g3|8iW67VGqb`DnNQw6c-bD*(E z3|6`@(tyb?Mn@(NRu;kl0RI&5U}X_zmavOFEt>B60dj9&(5OfSevL=Lxbt*CALu(~ z$12i$ph{fUA2jk`q>_L)9)L9KSI|Pa%y53UG~Ot-HhA2PV%tSOL=2=+Y<~$KXD9SB zplx3?ZVQNBL5{{;=9a>H^s=b2$mZ+ca!GWLT?+5@ z{a$y*HcvE5B;SzO6Ne14{?V@&A}5=zmc8$g0k>lnpBw_may zNX%zT&>3tWkD{Vt+~KD=UriiP^N^w+O*}*n$@dYQ8L@^rSJU1Y4(q68_evN?>EV$6 zzvaI7?C81}vh5$5(zqQvvTp@^7i=!lU36@eaBK|5${(om&6o7wY&xPHlA?Q@LA^-w zecJUrSp=CwF$R-u{#i?<2PC+Rb|uDeh+HtME>dEy()e~MobQPC+hX|-1oV9DuJau@ ztl|+nqra{Uzew`#N!$i2WRqAwp6sL6 zp-C4Ne1IPFf{j)|9u@pHqu^^O=smDyW$8zyZzo>`u#4<8xr!vCh2}o&ZIf3rMkOB@-(Dht71{2ahV&{HChJ{I3?xba2GL zyl&iR3+Sh@`-2ne!Z=@$v8#jw#Fj98fLttuhNl=#d%u41-g|)RB**t4!hm^Iy&rA) zhk#K92}7WKOER>m6aGAAmvC0Yw3Bv3op;+L-|?;wt9swxGK1&>hUQ7drbtG>3FCh% z5A+V0ZQQ9>pO$=mUB}Tr`k#lzqnyRwx@KT2U1Y8pW}Jg2jg#gToZdc+0W+h-!$euR;yEkf^i zLBh_fy{~lLq1ZmHE(wjZ52MMBXL&iTx0GqDu4tewrV#A^uvwxOYK zDaJLukWd(UV)x}(krR)jfbGBkgZjmSX6o0Gsoyqm`G3&w^i2Ia{>%DhqHz7b|CbE? 
zenvn}{f=1reGt;;(yt?jejSM+@J{Qul>ke>FK6nv3I&#a4`5#fr<7&Gh~>NvGoGo$ zHZU_*yjToiHDt+{oX^1F;3O7|d1dcsihhdD<7A4_T5|Oi^umbi$7L^8n_O%(5a6#kFcVNXG!4$Y4z1i)J0077}%Y(Ne`a4VsF%>xK) zRZBh~mv$MT34pM73KeQZQ3R+E-PKyj7QA~T<|#s!Oox$r(XD_=pL8oT{JBD1~(j&eftpRx?B4qxul@(#iZ59J>>=UCMN+v zHjy{Th-8>?WXb!w*rTFz(C3)Vl;k^z-VaF8zoPeB;7GCIQV;`0Z)}LT%EmIMqVLga z0o$weR_LMgERcB`R?ra#Fp#_R3^4gPJ3=48Kp&l_fpKY90y6;^=%n+nz!Yj!b_!2L zz(O|zTC1$TC41wtw_naskBrt7YZtY?jHxGNIJYwGAhZJQ^fT?yMhP1xKh!hQaa8fX z7V`ZD8qzfcrO#pny;=cjn$%X^mUSr*eL_RJ?i zynW1S?i}B0=4M$9d@b}ckYzO^9mf=3-x8zn9n05AxgDW; zAo_s-kYz#1=YZz>y4ba}a~!**;Oh=^J)yJYj@b2;m>NsZ0-q=LP~e%ShrSZCW9b|#3WdH z8c3JebDCTfCW*chb7bjXfh-hz{=)p$(o_1%LQH?`g$FbZTPe&D$%_~S?GdBLoZOYj z(NmZ!(@a+~A1CV~*>K)nRr(4$YZNKA5q8XwqkV8DaX1tIu1UJE*xsNAY@iEjYZJ;4+k@LCPn3L%k!FHY8x*yK>O)0KGfFeIfjHT&`!^|4b)K1$+S8xoj#GJ@%|`@6I) zDFFE>FWOJBWm8YlwpTG`EGu;5%S2-qLy_#kGj zO{d2Cu4YZGZ^sKn4Z@60cgBBV#*9$C*3q;vRNrw2-W+OSfDmqL>u6|dt;efD9g+HG z0q+t;*5WOr_IeF3QndNL*$$c}pH{1$=iYJf7v)7WW0vdK+8g6S{4<^)w#*VfYcjfZsZmrE-S+fEh zM$<5zkkbj44y~B)D`G*U*mU~Oc7aEUD%K%kPjoL*OgWyx|oN0)i`gv(cn&mqPvm1K1-!2hEfgl>_&MU`CbwN ziNAwlyK9AD+vu`gkv}PKyKRz-@Y9F@GF(NumByzR1P{uIzWtq4s)RaR;968rv_!mc zLteMAaO#4q=6JqLEN;PH5Pv@b-^0j-r3J1B>=gw?+Z+`IZo^qoF!TNifr65)69WbF zqm!fpzkO4|{J=m#NnlUG%!=O@xGP>QD5`j^z*X_Pg1p5A{^+Ew6Yrm3IJY?-ARQC$ z|B1hu(Al?1j;b{OMQQ%QG=E3FZ8BJcGZ=#}*n%s<+zP6Gh(FT#5n_y@2kcd2^_JAV)U%!VMg?XGHtS=b~8 zy;|uo?Mx6FQ0Bp30yez{Q^@Hl@teBNvWd(-?woAjQ1I;Jl?Cz1^~eJSJ1099I>*(1 z&{U)FDB5@dr8*RBYxu{@urr|kev$4-%rOJhl8~#?g1R2XA z(k(Pc5x6{pXVQT|^WuK8kF8FaXmmm~X2ZWuE|?i$c48XqBC~^JU(FcDlDAW-BJ|UF zBlHMM+G?-7XyW2+j;PcAgJM^>SgAVKcYv8;F&THVj2F`2Xyavu| z;JgOTYv8;F&THWRvj%ovp zw~vq=>(AkdW_<*a8O-ItTO|yB(ki#u{M|GDJNE2`9=68y2`m4lmG80g-&^^gt^A~w zPn>M_?=ma@s+G^P@H316LsN92y-v)6?n@HYHz zey=4zUmt21{#27UFHO?noiM@@P;$Bq1)#N`FhY&%Qgr(+q5Q zNT0)Q6VJZe0e(5~aeOYqmVQCz%zteKyD}W#Go{yD`9`<>O z*)IDI5AENTgU>eLX}q%g`;bLXUxH<(U*_QRN)G&cIq;t_{OEd0-vv?H%JezTdL8c2 zI<>CG#$sF)qAsaip(=7sEp9Fuy`V(ZT2$QK(MngBs10rE+UB-3_04JnzTodv>$^7M 
z+xnLF=7<()D86>CuWWoGwXvzSNv-ebsJ~N{%P zrW$RY`BH-1R8ucERUs{_DN*ODYFm3}>0DI?sY*RzmE@aB=hC|YYKyi}HD5}=RVpi* zYARIxUr{qxt*JydN0l4EtpUvHnpZ4ss*%(x6~OvhwPvYWTY?Kx_9Pj)lVwHgrLabvy06Oz!?|jjFQr&HR?^v&N1E6 zGQ3xl@lH$Czi<7eFv}KLD4P2BFDPQS3XScBnrlLosx_sU3E8s?Gh{{0?DX^y*!4l9 zJdc!*XqqXhM?Wj61;4r zYHez`v!ki0dTGQn9C`)P;S3IRHv!Sz1U02`QK$kt4w;hoc zkm>PTkg{v9>FngvQl4@9(kNvazldoR#Z)|=@r*0Dc{@{fZTPNnE{?LOEc-TZa>}k* zI*+|VrLMt%r2S_TQ!vb;dTE1o^;1^82#pQd=s9;?`TZHUIia;wqF`7e|@5 zPmMN%3#8b^QbI9a9@FaAAlEv~d>wzH*UdWGg<`xX7AX!?$g{NiwHAamrg%+P6Yi~R zY7mMET36q>PAG1;vlaZzTN8wgJ>4 z8}UhNCWzXYQi>z%tW|Pd1DJ3K+;MwXWMeqeuBqq;?20in7f_Q2f4)78ifFjL1#iqz zL)0KJr1N5!nNW-cum#Jrxo7yVe9}3Q&QAP1z~dtPzQ*hmD$!=9uK=JJ8LwY!_X8Te z57Et_(GR|V=9p4EWY*_#7R_Sl;e=M6T|Wpse#_p*+uvdLGs+znVkUz|9 zxDbE5e%$T{MDa$J34SZbc8%$Y*XQxRmB3?5o35XZ?_eKCiN`5T$mGGFDZdSW6eHyA z^El9x)F2X%Gg|D@?Qr@#KtnM{4)b`?VXOXQ!~}^zl9S6%I7)PmC!O;8Jg(Gj1wOOe z=k@^komS@n5*EN<1Wn9eE9oVUxdKgz1l z<7m#SObV+ToScnS8ohrka$o0f+)Lu>KT$p}pL^9FbVu{D+_&u^5Z{^wTj{wi-CXw(sB)^At*y=E!AG|)d zqaCP|**=d~jw~c@=gQ0L^PH{(&a9t~Yx-Gr;U{L6{^a#|J~^vCkAMEes?X)*40t^b zyHKBSTz_%@YF6~DpJnO!YC?kG<_}g*_qpPK+E4OgUVkn}SUJ&B;`tOj^R`)m_3>P_ z`EVsCwD4IV A2mk;8 literal 0 HcmV?d00001 diff --git a/deps/convert.cpp b/deps/convert.cpp new file mode 100644 index 00000000..a42bba25 --- /dev/null +++ b/deps/convert.cpp @@ -0,0 +1,258 @@ +#include +#include +#include +#include +#include +#include + +int read_mtx_matrix(const char* filename, + int& nrow, + int& ncol, + int& nnz, + std::vector& row, + std::vector& col, + std::vector& val) +{ + FILE* f = fopen(filename, "r"); + if(!f) + { + return -1; + } + + char line[1024]; + + // Check for banner + if(!fgets(line, 1024, f)) + { + return -1; + } + + char banner[16]; + char array[16]; + char coord[16]; + char data[16]; + char type[16]; + + // Extract banner + if(sscanf(line, "%s %s %s %s %s", banner, array, coord, data, type) != 5) + { + return -1; + } + + // Convert to lower case + for(char *p = array; *p != '\0'; *p = tolower(*p), p++) + ; + 
for(char *p = coord; *p != '\0'; *p = tolower(*p), p++) + ; + for(char *p = data; *p != '\0'; *p = tolower(*p), p++) + ; + for(char *p = type; *p != '\0'; *p = tolower(*p), p++) + ; + + // Check banner + if(strncmp(line, "%%MatrixMarket", 14) != 0) + { + return -1; + } + + // Check array type + if(strcmp(array, "matrix") != 0) + { + return -1; + } + + // Check coord + if(strcmp(coord, "coordinate") != 0) + { + return -1; + } + + // Check data + if(strcmp(data, "real") != 0 && strcmp(data, "integer") != 0 && strcmp(data, "pattern") != 0) + { + return -1; + } + + // Check type + if(strcmp(type, "general") != 0 && strcmp(type, "symmetric") != 0) + { + return -1; + } + + // Symmetric flag + int symm = !strcmp(type, "symmetric"); + + // Skip comments + while(fgets(line, 1024, f)) + { + if(line[0] != '%') + { + break; + } + } + + // Read dimensions + int snnz; + + sscanf(line, "%d %d %d", &nrow, &ncol, &snnz); + nnz = symm ? (snnz - nrow) * 2 + nrow : snnz; + + std::vector unsorted_row(nnz); + std::vector unsorted_col(nnz); + std::vector unsorted_val(nnz); + + // Read entries + int idx = 0; + while(fgets(line, 1024, f)) + { + if(idx >= nnz) + { + return -1; + } + + int irow; + int icol; + double ival; + + if(!strcmp(data, "pattern")) + { + sscanf(line, "%d %d", &irow, &icol); + ival = 1.0; + } + else + { + sscanf(line, "%d %d %lg", &irow, &icol, &ival); + } + + --irow; + --icol; + + unsorted_row[idx] = irow; + unsorted_col[idx] = icol; + unsorted_val[idx] = ival; + + ++idx; + + if(symm && irow != icol) + { + if(idx >= nnz) + { + return -1; + } + + unsorted_row[idx] = icol; + unsorted_col[idx] = irow; + unsorted_val[idx] = ival; + ++idx; + } + } + fclose(f); + + row.resize(nnz); + col.resize(nnz); + val.resize(nnz); + + // Sort by row and column index + std::vector perm(nnz); + for(int i = 0; i < nnz; ++i) + { + perm[i] = i; + } + + std::sort(perm.begin(), perm.end(), [&](const int& a, const int& b) { + if(unsorted_row[a] < unsorted_row[b]) + { + return true; + } + else 
if(unsorted_row[a] == unsorted_row[b]) + { + return (unsorted_col[a] < unsorted_col[b]); + } + else + { + return false; + } + }); + + for(int i = 0; i < nnz; ++i) + { + row[i] = unsorted_row[perm[i]]; + col[i] = unsorted_col[perm[i]]; + val[i] = unsorted_val[perm[i]]; + } + + return 0; +} + +int write_bin_matrix( + const char* filename, int m, int n, int nnz, const int* ptr, const int* col, const double* val) +{ + FILE* f = fopen(filename, "wb"); + if(!f) + { + return -1; + } + + int err; + err = fwrite(&m, sizeof(int), 1, f); + err |= fwrite(&n, sizeof(int), 1, f); + err |= fwrite(&nnz, sizeof(int), 1, f); + err |= fwrite(ptr, sizeof(int), m + 1, f); + err |= fwrite(col, sizeof(int), nnz, f); + err |= fwrite(val, sizeof(double), nnz, f); + + fclose(f); + + return 0; +} + +int coo_to_csr(int m, int nnz, const int* src_row, std::vector& dst_ptr) +{ + dst_ptr.resize(m + 1, 0); + + // Compute nnz entries per row + for(int i = 0; i < nnz; ++i) + { + ++dst_ptr[src_row[i] + 1]; + } + + // Exclusive scan + for(int i = 0; i < m; ++i) + { + dst_ptr[i + 1] += dst_ptr[i]; + } + + return 0; +} + +int main(int argc, char* argv[]) +{ + int m; + int n; + int nnz; + + std::vector ptr; + std::vector row; + std::vector col; + std::vector val; + + if(read_mtx_matrix(argv[1], m, n, nnz, row, col, val) != 0) + { + fprintf(stderr, "Cannot open [read] %s.\n", argv[1]); + return -1; + } + + if(coo_to_csr(m, nnz, row.data(), ptr) != 0) + { + fprintf(stderr, "Cannot convert %s from COO to CSR.\n", argv[1]); + return -1; + } + + if(write_bin_matrix(argv[2], m, n, nnz, ptr.data(), col.data(), val.data()) != 0) + { + fprintf(stderr, "Cannot open [write] %s.\n", argv[2]); + return -1; + } + + return 0; +} From 87a3ce88476d92bb8962e0cd152415af303c4b5f Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 24 Jul 2018 08:19:31 +0200 Subject: [PATCH 174/304] fix for csr2csc where buffer_size could be zero --- library/src/conversion/rocsparse_csr2csc.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 
1 deletion(-) diff --git a/library/src/conversion/rocsparse_csr2csc.cpp b/library/src/conversion/rocsparse_csr2csc.cpp index 9ec4f4ff..e72b8d02 100644 --- a/library/src/conversion/rocsparse_csr2csc.cpp +++ b/library/src/conversion/rocsparse_csr2csc.cpp @@ -71,7 +71,8 @@ extern "C" rocsparse_status rocsparse_csr2csc_buffer_size(rocsparse_handle handl // Quick return if possible if(m == 0 || n == 0 || nnz == 0) { - *buffer_size = 0; + // Do not return 0 as buffer size + *buffer_size = 4; return rocsparse_status_success; } @@ -92,6 +93,12 @@ extern "C" rocsparse_status rocsparse_csr2csc_buffer_size(rocsparse_handle handl *buffer_size += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; *buffer_size += sizeof(rocsparse_int) * ((nnz - 1) / 256 + 1) * 256; + // Do not return 0 as size + if(*buffer_size == 0) + { + *buffer_size = 4; + } + return rocsparse_status_success; } From fb6f809230401c80eefe258c0456fc7c0b395d54 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 25 Jul 2018 08:38:23 +0200 Subject: [PATCH 175/304] csr2csc test fix --- clients/include/testing_csr2csc.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clients/include/testing_csr2csc.hpp b/clients/include/testing_csr2csc.hpp index 5a776c61..068affb4 100644 --- a/clients/include/testing_csr2csc.hpp +++ b/clients/include/testing_csr2csc.hpp @@ -335,8 +335,8 @@ rocsparse_status testing_csr2csc(Arguments argus) verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); // Buffer size should be zero - size_t zero = 0; - unit_check_general(1, 1, &zero, &size); + size_t four = 4; + unit_check_general(1, 1, &four, &size); } status = rocsparse_csr2csc(handle, From af2e014af433d183a7b816d62ed00a196b47e7d4 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 25 Jul 2018 10:20:55 +0200 Subject: [PATCH 176/304] working csrmv_adaptive embedded into csrmv --- clients/samples/example_csrmv.cpp | 6 +- library/include/rocsparse-functions.h | 74 +--- library/src/CMakeLists.txt | 1 
- library/src/level2/csrmv_adaptive_device.h | 367 ------------------ library/src/level2/csrmv_device.h | 358 +++++++++++++++++ library/src/level2/rocsparse_csrmv.cpp | 265 ++++++++++++- library/src/level2/rocsparse_csrmv.hpp | 222 +++++++++-- .../src/level2/rocsparse_csrmv_adaptive.cpp | 305 --------------- .../src/level2/rocsparse_csrmv_adaptive.hpp | 242 ------------ 9 files changed, 820 insertions(+), 1020 deletions(-) delete mode 100644 library/src/level2/csrmv_adaptive_device.h delete mode 100644 library/src/level2/rocsparse_csrmv_adaptive.cpp delete mode 100644 library/src/level2/rocsparse_csrmv_adaptive.hpp diff --git a/clients/samples/example_csrmv.cpp b/clients/samples/example_csrmv.cpp index 6c5421f4..b5063b4f 100644 --- a/clients/samples/example_csrmv.cpp +++ b/clients/samples/example_csrmv.cpp @@ -97,7 +97,8 @@ int main(int argc, char* argv[]) dAcol, dx, &hbeta, - dy); + dy, + nullptr); } // Device synchronization @@ -124,7 +125,8 @@ int main(int argc, char* argv[]) dAcol, dx, &hbeta, - dy); + dy, + nullptr); } // Device synchronization diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index d2a58c78..f2e4e95f 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -578,6 +578,11 @@ rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, y := alpha * op(A) * x + beta * y + The info parameter is optional and contains information collected by + rocsparse_csrmv_analysis. If present, it will be used to speed up the + csrmv computation. If info == nullptr, general csrmv routine will be + called instead. + @param[in] handle rocsparse_handle. handle to the rocsparse library context queue. @@ -608,6 +613,9 @@ rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, @param[inout] y array of m elements (op(A) = A) or n elements (op(A) = A^T or op(A) = A^H). + @param[in] + info [optional] information collected by rocsparse_csrmv_analysis. 
+ if nullptr is passed, general csrmv routine will be called. ********************************************************************/ ROCSPARSE_EXPORT @@ -623,7 +631,8 @@ rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, const rocsparse_int* csr_col_ind, const float* x, const float* beta, - float* y); + float* y, + const rocsparse_csrmv_info info); ROCSPARSE_EXPORT rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, @@ -638,7 +647,8 @@ rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, const rocsparse_int* csr_col_ind, const double* x, const double* beta, - double* y); + double* y, + const rocsparse_csrmv_info info); /* ROCSPARSE_EXPORT rocsparse_status rocsparse_ccsrmv(rocsparse_handle handle, @@ -653,7 +663,8 @@ rocsparse_status rocsparse_ccsrmv(rocsparse_handle handle, const rocsparse_int* csr_col_ind, const rocsparse_float_complex* x, const rocsparse_float_complex* beta, - rocsparse_float_complex* y); + rocsparse_float_complex* y, + const rocsparse_csrmv_info info); ROCSPARSE_EXPORT rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, @@ -668,63 +679,10 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, const rocsparse_int* csr_col_ind, const rocsparse_double_complex* x, const rocsparse_double_complex* beta, - rocsparse_double_complex* y); + rocsparse_double_complex* y, + const rocsparse_csrmv_info info); */ - - - - - - - - - - - -ROCSPARSE_EXPORT -rocsparse_status rocsparse_scsrmv_adaptive(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int n, - rocsparse_int nnz, - const float* alpha, - const rocsparse_mat_descr descr, - const float* csr_val, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const float* x, - const float* beta, - float* y, - const rocsparse_csrmv_info info); - -ROCSPARSE_EXPORT -rocsparse_status rocsparse_dcsrmv_adaptive(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int n, - rocsparse_int nnz, - const 
double* alpha, - const rocsparse_mat_descr descr, - const double* csr_val, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const double* x, - const double* beta, - double* y, - const rocsparse_csrmv_info info); - - - - - - - - - - - - /*! \brief SPARSE Level 2 API \details diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 44783335..e6f77889 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -20,7 +20,6 @@ set(rocsparse_source # Level2 src/level2/rocsparse_coomv.cpp src/level2/rocsparse_csrmv.cpp - src/level2/rocsparse_csrmv_adaptive.cpp src/level2/rocsparse_ellmv.cpp src/level2/rocsparse_hybmv.cpp diff --git a/library/src/level2/csrmv_adaptive_device.h b/library/src/level2/csrmv_adaptive_device.h deleted file mode 100644 index 2abf254e..00000000 --- a/library/src/level2/csrmv_adaptive_device.h +++ /dev/null @@ -1,367 +0,0 @@ -#pragma once -#ifndef CSRMV_ADAPTIVE_DEVICE_H -#define CSRMV_ADAPTIVE_DEVICE_H - -#include - -static inline __device__ float atomic_add_float_extended(float* ptr, float temp, float* old_sum) -{ - return atomicAdd(ptr, temp); -} - -static inline __device__ double atomic_add_float_extended(double* ptr, double temp, double* old_sum) -{ - unsigned long long newVal; - unsigned long long prevVal; - do - { - prevVal = __double_as_longlong(*ptr); - newVal = __double_as_longlong(temp + *ptr); - } while(atomicCAS((unsigned long long*)ptr, prevVal, newVal) != prevVal); - if(old_sum != 0) - *old_sum = (double)prevVal; - return (double)newVal; -} - -template -static inline __device__ T -sum2_reduce(T cur_sum, T* partial, int lid, int max_size, int reduc_size) -{ - if(max_size > reduc_size) - { - cur_sum += partial[lid + reduc_size]; - __syncthreads(); - partial[lid] = cur_sum; - } - return cur_sum; -} - -template -__device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, - T alpha, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const T* 
csr_val, - const T* x, - T beta, - T* y, - rocsparse_index_base idx_base) -{ - __shared__ T partialSums[BLOCKSIZE]; - unsigned int gid = hipBlockIdx_x; - unsigned int lid = hipThreadIdx_x; - - // The row blocks buffer holds a packed set of information used to inform each - // workgroup about how to do its work: - // - // |6666 5555 5555 5544 4444 4444 3333 3333|3322 2222|2222 1111 1111 1100 0000 0000| - // |3210 9876 5432 1098 7654 3210 9876 5432|1098 7654|3210 9876 5432 1098 7654 3210| - // |------------Row Information------------|--------^|---WG ID within a long row---| - // | | flag/|or # reduce threads for short| - // - // The upper 32 bits of each rowBlock entry tell the workgroup the ID of the first - // row it will be working on. When one workgroup calculates multiple rows, this - // rowBlock entry and the next one tell it the range of rows to work on. - // The lower 24 bits are used whenever multiple workgroups calculate a single long - // row. This tells each workgroup its ID within that row, so it knows which - // part of the row to operate on. - // Alternately, on short row blocks, the lower bits are used to communicate - // the number of threads that should be used for the reduction. Pre-calculating - // this on the CPU-side results in a noticable performance uplift on many matrices. - // Bit 24 is a flag bit used so that the multiple WGs calculating a long row can - // know when the first workgroup for that row has finished initializing the output - // value. While this bit is the same as the first workgroup's flag bit, this - // workgroup will spin-loop. - unsigned int row = ((row_blocks[gid] >> (64 - ROW_BITS)) & ((1ULL << ROW_BITS) - 1ULL)); - unsigned int stop_row = - ((row_blocks[gid + 1] >> (64 - ROW_BITS)) & ((1ULL << ROW_BITS) - 1ULL)); - unsigned int num_rows = stop_row - row; - - // Get the workgroup within this long row ID out of the bottom bits of the row block. 
- unsigned int wg = row_blocks[gid] & ((1 << WG_BITS) - 1); - - // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. - // If there are more items in this row, we assign more workgroups. -// unsigned int vecStart = hc::__mad24(wg, (unsigned int)(BLOCK_MULTIPLIER * BLOCKSIZE), (unsigned int)csr_row_ptr[row]); - unsigned int vecStart = ((wg >> 8) << 8) * (((unsigned int)(BLOCK_MULTIPLIER * BLOCKSIZE) >> 8) << 8) + csr_row_ptr[row]; - unsigned int vecEnd = (csr_row_ptr[row + 1] > vecStart + BLOCK_MULTIPLIER * BLOCKSIZE) - ? vecStart + BLOCK_MULTIPLIER * BLOCKSIZE - : csr_row_ptr[row + 1]; - - T temp_sum = 0.; - - // If the next row block starts more than 2 rows away, then we choose CSR-Stream. - // If this is zero (long rows) or one (final workgroup in a long row, or a single - // row in a row block), we want to use the CSR-Vector algorithm(s). - // We have found, through experimentation, that CSR-Vector is generally faster - // when working on 2 rows, due to its simplicity and better reduction method. - if(num_rows > ROWS_FOR_VECTOR) - { - // CSR-Stream case. See Sections III.A and III.B in the SC'14 paper: - // Efficient Sparse Matrix-Vector Multiplication on GPUs using the CSR Storage Format - // for a detailed description of CSR-Stream. - // In a nutshell, the idea is to use all of the threads to stream the matrix - // values into the local memory in a fast, coalesced manner. After that, the - // per-row reductions are done out of the local memory, which is designed - // to handle non-coalsced accesses. - - // The best method for reducing the local memory values depends on the number - // of rows. The SC'14 paper discusses a CSR-Scalar style reduction where - // each thread reduces its own row. This yields good performance if there - // are many (relatively short) rows. 
However, if they are few (relatively - // long) rows, it's actually better to perform a tree-style reduction where - // multiple threads team up to reduce the same row. - - // The calculation below tells you how many threads this workgroup can allocate - // to each row, assuming that every row gets the same number of threads. - // We want the closest lower (or equal) power-of-2 to this number -- - // that is how many threads can work in each row's reduction using our algorithm. - // For instance, with workgroup size 256, 2 rows = 128 threads, 3 rows = 64 - // threads, 4 rows = 64 threads, 5 rows = 32 threads, etc. - // int numThreadsForRed = get_local_size(0) >> ((CHAR_BIT*sizeof(unsigned - // int))-clz(num_rows-1)); - unsigned int numThreadsForRed = wg; // Same calculation as above, done on host. - - // Stream all of this row block's matrix values into local memory. - // Perform the matvec in parallel with this work. - unsigned int col = csr_row_ptr[row] + lid; - if(gid != (gridDim.x - 1)) - { - for(int i = 0; i < BLOCKSIZE; i += WG_SIZE) - partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i]]; - } - else - { - // This is required so that we stay in bounds for csr_val[] and csr_col_ind[]. - // Otherwise, if the matrix's endpoints don't line up with BLOCKSIZE, - // we will buffer overflow. On today's dGPUs, this doesn't cause problems. - // The values are within a dGPU's page, which is zeroed out on allocation. - // However, this may change in the future (e.g. with shared virtual memory.) - // This causes a minor performance loss because this is the last workgroup - // to be launched, and this loop can't be unrolled. - for(int i = 0; col + i < csr_row_ptr[stop_row]; i += WG_SIZE) - partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i]]; - } - __syncthreads(); - - if(numThreadsForRed > 1) - { - // In this case, we want to have the workgroup perform a tree-style reduction - // of each row. 
{numThreadsForRed} adjacent threads team up to linearly reduce - // a row into {numThreadsForRed} locations in local memory. - // After that, the entire workgroup does a parallel reduction, and each - // row ends up with an individual answer. - - // {numThreadsForRed} adjacent threads all work on the same row, so their - // start and end values are the same. - // numThreadsForRed guaranteed to be a power of two, so the clz code below - // avoids an integer divide. ~2% perf gain in EXTRA_PRECISION. - // size_t st = lid/numThreadsForRed; - unsigned int local_row = row + (lid >> (31 - __clz(numThreadsForRed))); - unsigned int local_first_val = csr_row_ptr[local_row] - csr_row_ptr[row]; - unsigned int local_last_val = csr_row_ptr[local_row + 1] - csr_row_ptr[row]; - unsigned int threadInBlock = lid & (numThreadsForRed - 1); - - // Not all row blocks are full -- they may have an odd number of rows. As such, - // we need to ensure that adjacent-groups only work on real data for this rowBlock. - if(local_row < stop_row) - { - // This is dangerous -- will infinite loop if your last value is within - // numThreadsForRed of MAX_UINT. Noticable performance gain to avoid a - // long induction variable here, though. - for(unsigned int local_cur_val = local_first_val + threadInBlock; - local_cur_val < local_last_val; - local_cur_val += numThreadsForRed) - temp_sum += partialSums[local_cur_val]; - } - __syncthreads(); - - partialSums[lid] = temp_sum; - - // Step one of this two-stage reduction is done. Now each row has {numThreadsForRed} - // values sitting in the local memory. This means that, roughly, the beginning of - // LDS is full up to {workgroup size} entries. - // Now we perform a parallel reduction that sums together the answers for each - // row in parallel, leaving us an answer in 'temp_sum' for each row. 
- for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) - { - __syncthreads(); - temp_sum = sum2_reduce(temp_sum, partialSums, lid, numThreadsForRed, i); - } - - if(threadInBlock == 0 && local_row < stop_row) - { - // All of our write-outs check to see if the output vector should first be zeroed. - // If so, just do a write rather than a read-write. Measured to be a slight (~5%) - // performance improvement. - if(beta != 0.) - temp_sum += beta * y[local_row]; - y[local_row] = temp_sum; - } - } - else - { - // In this case, we want to have each thread perform the reduction for a single row. - // Essentially, this looks like performing CSR-Scalar, except it is computed out of - // local memory. - // However, this reduction is also much faster than CSR-Scalar, because local memory - // is designed for scatter-gather operations. - // We need a while loop because there may be more rows than threads in the WG. - unsigned int local_row = row + lid; - while(local_row < stop_row) - { - int local_first_val = (csr_row_ptr[local_row] - csr_row_ptr[row]); - int local_last_val = csr_row_ptr[local_row + 1] - csr_row_ptr[row]; - temp_sum = 0.; - for(int local_cur_val = local_first_val; local_cur_val < local_last_val; - local_cur_val++) - temp_sum += partialSums[local_cur_val]; - - // After you've done the reduction into the temp_sum register, - // put that into the output for each row. - if(beta != 0.) - temp_sum += beta * y[local_row]; - y[local_row] = temp_sum; - local_row += WG_SIZE; - } - } - } - else if(num_rows >= 1 && !wg) // CSR-Vector case. - { - // ^^ The above check says that if this workgroup is supposed to work on <= ROWS_VECTOR - // number of rows then we should do the CSR-Vector algorithm. If we want this row to be - // done with CSR-LongRows, then all of its workgroups (except the last one) will have the - // same stop_row and row. The final workgroup in a LongRow will have stop_row and row - // different, but the internal wg number will be non-zero. 
- - // If this workgroup is operating on multiple rows (because CSR-Stream is poor for small - // numbers of rows), then it needs to iterate until it reaches the stop_row. - // We don't check <= stop_row because of the potential for unsigned overflow. - while(row < stop_row) - { - // Any workgroup only calculates, at most, BLOCKSIZE items in this row. - // If there are more items in this row, we use CSR-LongRows. - temp_sum = 0.; - vecStart = csr_row_ptr[row]; - vecEnd = csr_row_ptr[row + 1]; - - // Load in a bunch of partial results into your register space, rather than LDS (no - // contention) - // Then dump the partially reduced answers into the LDS for inter-work-item reduction. - // Using a long induction variable to make sure unsigned int overflow doesn't break - // things. - for(unsigned long long j = vecStart + lid; j < vecEnd; j += WG_SIZE) - { - unsigned int col = csr_col_ind[(unsigned int)j]; - temp_sum += alpha * csr_val[(unsigned int)j] * x[col]; - } - - partialSums[lid] = temp_sum; - - // Reduce partial sums - for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) - { - __syncthreads(); - temp_sum = sum2_reduce(temp_sum, partialSums, lid, WG_SIZE, i); - } - - if(lid == 0U) - { - if(beta != 0.) - temp_sum += beta * y[row]; - y[row] = temp_sum; - } - row++; - } - } - else - { - // In CSR-LongRows, we have more than one workgroup calculating this row. - // The output values for those types of rows are stored using atomic_add, because - // more than one parallel workgroup's value makes up the final answer. - // Unfortunately, this makes it difficult to do y=Ax, rather than y=Ax+y, because - // the values still left in y will be added in using the atomic_add. - // - // Our solution is to have the first workgroup in one of these long-rows cases - // properly initaizlie the output vector. All the other workgroups working on this - // row will spin-loop until that workgroup finishes its work. - - // First, figure out which workgroup you are in the row. Bottom 24 bits. 
- // You can use that to find the global ID for the first workgroup calculating - // this long row. - unsigned int first_wg_in_row = gid - (row_blocks[gid] & ((1ULL << WG_BITS) - 1ULL)); - unsigned int compare_value = row_blocks[gid] & (1ULL << WG_BITS); - - // Bit 24 in the first workgroup is the flag that everyone waits on. - if(gid == first_wg_in_row && lid == 0ULL) - { - // The first workgroup handles the output initialization. - T out_val = y[row]; - temp_sum = (beta - 1.) * out_val; - atomicXor(&row_blocks[first_wg_in_row], (1ULL << WG_BITS)); // Release other workgroups. - } - // For every other workgroup, bit 24 holds the value they wait on. - // If your bit 24 == first_wg's bit 24, you spin loop. - // The first workgroup will eventually flip this bit, and you can move forward. - __syncthreads(); - while( - gid != first_wg_in_row && lid == 0U && - ((atomicMax(&row_blocks[first_wg_in_row], 0ULL) & (1ULL << WG_BITS)) == compare_value)) - ; - __syncthreads(); - - // After you've passed the barrier, update your local flag to make sure that - // the next time through, you know what to wait on. - if(gid != first_wg_in_row && lid == 0ULL) - row_blocks[gid] ^= (1ULL << WG_BITS); - - // All but the final workgroup in a long-row collaboration have the same start_row - // and stop_row. They only run for one iteration. - // Load in a bunch of partial results into your register space, rather than LDS (no - // contention) - // Then dump the partially reduced answers into the LDS for inter-work-item reduction. - unsigned int col = vecStart + lid; - if(row == stop_row) // inner thread, we can hardcode/unroll this loop - { - // Don't put BLOCK_MULTIPLIER*BLOCKSIZE as the stop point, because - // some GPU compilers will *aggressively* unroll this loop. - // That increases register pressure and reduces occupancy. 
- for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) - { - temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j]]; -#if 2 * WG_SIZE <= BLOCK_MULTIPLIER * BLOCKSIZE - // If you can, unroll this loop once. It somewhat helps performance. - j += WG_SIZE; - temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j]]; -#endif - } - } - else - { - for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) - temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j]]; - } - - partialSums[lid] = temp_sum; - - // Reduce partial sums - for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) - { - __syncthreads(); - temp_sum = sum2_reduce(temp_sum, partialSums, lid, WG_SIZE, i); - } - - if(lid == 0U) - { - atomic_add_float_extended(&y[row], temp_sum, 0); - } - } -} - -#endif // CSRMV_ADAPTIVE_DEVICE_H diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index a1aa7728..4b27bfaa 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -185,4 +185,362 @@ static __device__ void csrmvn_general_device(rocsparse_int m, } } +__device__ static __inline__ void atomic_add(float *address, float val) +{ + atomicAdd(address, val); +} + +__device__ static __inline__ void atomic_add(double *address, double val) +{ + unsigned long long newVal; + unsigned long long prevVal; + + do + { + prevVal = __double_as_longlong(*address); + newVal = __double_as_longlong(val + *address); + } + while(atomicCAS((unsigned long long*)address, prevVal, newVal) != prevVal); +} + +template +static inline __device__ T +sum2_reduce(T cur_sum, T* partial, int lid, int max_size, int reduc_size) +{ + if(max_size > reduc_size) + { + cur_sum += partial[lid + reduc_size]; + __syncthreads(); + partial[lid] = cur_sum; + } + return cur_sum; +} + +template +__device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, + T alpha, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* csr_val, + const T* x, + T beta, + T* y, + 
rocsparse_index_base idx_base) +{ + __shared__ T partialSums[BLOCKSIZE]; + unsigned int gid = hipBlockIdx_x; + unsigned int lid = hipThreadIdx_x; + + // The row blocks buffer holds a packed set of information used to inform each + // workgroup about how to do its work: + // + // |6666 5555 5555 5544 4444 4444 3333 3333|3322 2222|2222 1111 1111 1100 0000 0000| + // |3210 9876 5432 1098 7654 3210 9876 5432|1098 7654|3210 9876 5432 1098 7654 3210| + // |------------Row Information------------|--------^|---WG ID within a long row---| + // | | flag/|or # reduce threads for short| + // + // The upper 32 bits of each rowBlock entry tell the workgroup the ID of the first + // row it will be working on. When one workgroup calculates multiple rows, this + // rowBlock entry and the next one tell it the range of rows to work on. + // The lower 24 bits are used whenever multiple workgroups calculate a single long + // row. This tells each workgroup its ID within that row, so it knows which + // part of the row to operate on. + // Alternately, on short row blocks, the lower bits are used to communicate + // the number of threads that should be used for the reduction. Pre-calculating + // this on the CPU-side results in a noticable performance uplift on many matrices. + // Bit 24 is a flag bit used so that the multiple WGs calculating a long row can + // know when the first workgroup for that row has finished initializing the output + // value. While this bit is the same as the first workgroup's flag bit, this + // workgroup will spin-loop. + unsigned int row = ((row_blocks[gid] >> (64 - ROW_BITS)) & ((1ULL << ROW_BITS) - 1ULL)); + unsigned int stop_row = + ((row_blocks[gid + 1] >> (64 - ROW_BITS)) & ((1ULL << ROW_BITS) - 1ULL)); + unsigned int num_rows = stop_row - row; + + // Get the workgroup within this long row ID out of the bottom bits of the row block. 
+ unsigned int wg = row_blocks[gid] & ((1 << WG_BITS) - 1); + + // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. + // If there are more items in this row, we assign more workgroups. + unsigned int vecStart = hc::__mad24(wg, (unsigned int)(BLOCK_MULTIPLIER * BLOCKSIZE), (unsigned int)csr_row_ptr[row]); + unsigned int vecEnd = (csr_row_ptr[row + 1] > vecStart + BLOCK_MULTIPLIER * BLOCKSIZE) + ? vecStart + BLOCK_MULTIPLIER * BLOCKSIZE + : csr_row_ptr[row + 1]; + + T temp_sum = 0.; + + // If the next row block starts more than 2 rows away, then we choose CSR-Stream. + // If this is zero (long rows) or one (final workgroup in a long row, or a single + // row in a row block), we want to use the CSR-Vector algorithm(s). + // We have found, through experimentation, that CSR-Vector is generally faster + // when working on 2 rows, due to its simplicity and better reduction method. + if(num_rows > ROWS_FOR_VECTOR) + { + // CSR-Stream case. See Sections III.A and III.B in the SC'14 paper: + // Efficient Sparse Matrix-Vector Multiplication on GPUs using the CSR Storage Format + // for a detailed description of CSR-Stream. + // In a nutshell, the idea is to use all of the threads to stream the matrix + // values into the local memory in a fast, coalesced manner. After that, the + // per-row reductions are done out of the local memory, which is designed + // to handle non-coalsced accesses. + + // The best method for reducing the local memory values depends on the number + // of rows. The SC'14 paper discusses a CSR-Scalar style reduction where + // each thread reduces its own row. This yields good performance if there + // are many (relatively short) rows. However, if they are few (relatively + // long) rows, it's actually better to perform a tree-style reduction where + // multiple threads team up to reduce the same row. 
+ + // The calculation below tells you how many threads this workgroup can allocate + // to each row, assuming that every row gets the same number of threads. + // We want the closest lower (or equal) power-of-2 to this number -- + // that is how many threads can work in each row's reduction using our algorithm. + // For instance, with workgroup size 256, 2 rows = 128 threads, 3 rows = 64 + // threads, 4 rows = 64 threads, 5 rows = 32 threads, etc. + // int numThreadsForRed = get_local_size(0) >> ((CHAR_BIT*sizeof(unsigned + // int))-clz(num_rows-1)); + unsigned int numThreadsForRed = wg; // Same calculation as above, done on host. + + // Stream all of this row block's matrix values into local memory. + // Perform the matvec in parallel with this work. + unsigned int col = csr_row_ptr[row] + lid; + if(gid != (gridDim.x - 1)) + { + for(int i = 0; i < BLOCKSIZE; i += WG_SIZE) + partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i]]; + } + else + { + // This is required so that we stay in bounds for csr_val[] and csr_col_ind[]. + // Otherwise, if the matrix's endpoints don't line up with BLOCKSIZE, + // we will buffer overflow. On today's dGPUs, this doesn't cause problems. + // The values are within a dGPU's page, which is zeroed out on allocation. + // However, this may change in the future (e.g. with shared virtual memory.) + // This causes a minor performance loss because this is the last workgroup + // to be launched, and this loop can't be unrolled. + for(int i = 0; col + i < csr_row_ptr[stop_row]; i += WG_SIZE) + partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i]]; + } + __syncthreads(); + + if(numThreadsForRed > 1) + { + // In this case, we want to have the workgroup perform a tree-style reduction + // of each row. {numThreadsForRed} adjacent threads team up to linearly reduce + // a row into {numThreadsForRed} locations in local memory. 
+ // After that, the entire workgroup does a parallel reduction, and each + // row ends up with an individual answer. + + // {numThreadsForRed} adjacent threads all work on the same row, so their + // start and end values are the same. + // numThreadsForRed guaranteed to be a power of two, so the clz code below + // avoids an integer divide. ~2% perf gain in EXTRA_PRECISION. + // size_t st = lid/numThreadsForRed; + unsigned int local_row = row + (lid >> (31 - __clz(numThreadsForRed))); + unsigned int local_first_val = csr_row_ptr[local_row] - csr_row_ptr[row]; + unsigned int local_last_val = csr_row_ptr[local_row + 1] - csr_row_ptr[row]; + unsigned int threadInBlock = lid & (numThreadsForRed - 1); + + // Not all row blocks are full -- they may have an odd number of rows. As such, + // we need to ensure that adjacent-groups only work on real data for this rowBlock. + if(local_row < stop_row) + { + // This is dangerous -- will infinite loop if your last value is within + // numThreadsForRed of MAX_UINT. Noticable performance gain to avoid a + // long induction variable here, though. + for(unsigned int local_cur_val = local_first_val + threadInBlock; + local_cur_val < local_last_val; + local_cur_val += numThreadsForRed) + temp_sum += partialSums[local_cur_val]; + } + __syncthreads(); + + partialSums[lid] = temp_sum; + + // Step one of this two-stage reduction is done. Now each row has {numThreadsForRed} + // values sitting in the local memory. This means that, roughly, the beginning of + // LDS is full up to {workgroup size} entries. + // Now we perform a parallel reduction that sums together the answers for each + // row in parallel, leaving us an answer in 'temp_sum' for each row. + for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + __syncthreads(); + temp_sum = sum2_reduce(temp_sum, partialSums, lid, numThreadsForRed, i); + } + + if(threadInBlock == 0 && local_row < stop_row) + { + // All of our write-outs check to see if the output vector should first be zeroed. 
+ // If so, just do a write rather than a read-write. Measured to be a slight (~5%) + // performance improvement. + if(beta != 0.) + temp_sum += beta * y[local_row]; + y[local_row] = temp_sum; + } + } + else + { + // In this case, we want to have each thread perform the reduction for a single row. + // Essentially, this looks like performing CSR-Scalar, except it is computed out of + // local memory. + // However, this reduction is also much faster than CSR-Scalar, because local memory + // is designed for scatter-gather operations. + // We need a while loop because there may be more rows than threads in the WG. + unsigned int local_row = row + lid; + while(local_row < stop_row) + { + int local_first_val = (csr_row_ptr[local_row] - csr_row_ptr[row]); + int local_last_val = csr_row_ptr[local_row + 1] - csr_row_ptr[row]; + temp_sum = 0.; + for(int local_cur_val = local_first_val; local_cur_val < local_last_val; + local_cur_val++) + temp_sum += partialSums[local_cur_val]; + + // After you've done the reduction into the temp_sum register, + // put that into the output for each row. + if(beta != 0.) + temp_sum += beta * y[local_row]; + y[local_row] = temp_sum; + local_row += WG_SIZE; + } + } + } + else if(num_rows >= 1 && !wg) // CSR-Vector case. + { + // ^^ The above check says that if this workgroup is supposed to work on <= ROWS_VECTOR + // number of rows then we should do the CSR-Vector algorithm. If we want this row to be + // done with CSR-LongRows, then all of its workgroups (except the last one) will have the + // same stop_row and row. The final workgroup in a LongRow will have stop_row and row + // different, but the internal wg number will be non-zero. + + // If this workgroup is operating on multiple rows (because CSR-Stream is poor for small + // numbers of rows), then it needs to iterate until it reaches the stop_row. + // We don't check <= stop_row because of the potential for unsigned overflow. 
+ while(row < stop_row) + { + // Any workgroup only calculates, at most, BLOCKSIZE items in this row. + // If there are more items in this row, we use CSR-LongRows. + temp_sum = 0.; + vecStart = csr_row_ptr[row]; + vecEnd = csr_row_ptr[row + 1]; + + // Load in a bunch of partial results into your register space, rather than LDS (no + // contention) + // Then dump the partially reduced answers into the LDS for inter-work-item reduction. + // Using a long induction variable to make sure unsigned int overflow doesn't break + // things. + for(unsigned long long j = vecStart + lid; j < vecEnd; j += WG_SIZE) + { + unsigned int col = csr_col_ind[(unsigned int)j]; + temp_sum += alpha * csr_val[(unsigned int)j] * x[col]; + } + + partialSums[lid] = temp_sum; + + // Reduce partial sums + for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + __syncthreads(); + temp_sum = sum2_reduce(temp_sum, partialSums, lid, WG_SIZE, i); + } + + if(lid == 0U) + { + if(beta != 0.) + temp_sum += beta * y[row]; + y[row] = temp_sum; + } + row++; + } + } + else + { + // In CSR-LongRows, we have more than one workgroup calculating this row. + // The output values for those types of rows are stored using atomic_add, because + // more than one parallel workgroup's value makes up the final answer. + // Unfortunately, this makes it difficult to do y=Ax, rather than y=Ax+y, because + // the values still left in y will be added in using the atomic_add. + // + // Our solution is to have the first workgroup in one of these long-rows cases + // properly initaizlie the output vector. All the other workgroups working on this + // row will spin-loop until that workgroup finishes its work. + + // First, figure out which workgroup you are in the row. Bottom 24 bits. + // You can use that to find the global ID for the first workgroup calculating + // this long row. 
+ unsigned int first_wg_in_row = gid - (row_blocks[gid] & ((1ULL << WG_BITS) - 1ULL)); + unsigned int compare_value = row_blocks[gid] & (1ULL << WG_BITS); + + // Bit 24 in the first workgroup is the flag that everyone waits on. + if(gid == first_wg_in_row && lid == 0ULL) + { + // The first workgroup handles the output initialization. + T out_val = y[row]; + temp_sum = (beta - 1.) * out_val; + atomicXor(&row_blocks[first_wg_in_row], (1ULL << WG_BITS)); // Release other workgroups. + } + // For every other workgroup, bit 24 holds the value they wait on. + // If your bit 24 == first_wg's bit 24, you spin loop. + // The first workgroup will eventually flip this bit, and you can move forward. + __syncthreads(); + while( + gid != first_wg_in_row && lid == 0U && + ((atomicMax(&row_blocks[first_wg_in_row], 0ULL) & (1ULL << WG_BITS)) == compare_value)) + ; + __syncthreads(); + + // After you've passed the barrier, update your local flag to make sure that + // the next time through, you know what to wait on. + if(gid != first_wg_in_row && lid == 0ULL) + row_blocks[gid] ^= (1ULL << WG_BITS); + + // All but the final workgroup in a long-row collaboration have the same start_row + // and stop_row. They only run for one iteration. + // Load in a bunch of partial results into your register space, rather than LDS (no + // contention) + // Then dump the partially reduced answers into the LDS for inter-work-item reduction. + unsigned int col = vecStart + lid; + if(row == stop_row) // inner thread, we can hardcode/unroll this loop + { + // Don't put BLOCK_MULTIPLIER*BLOCKSIZE as the stop point, because + // some GPU compilers will *aggressively* unroll this loop. + // That increases register pressure and reduces occupancy. + for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) + { + temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j]]; +#if 2 * WG_SIZE <= BLOCK_MULTIPLIER * BLOCKSIZE + // If you can, unroll this loop once. It somewhat helps performance. 
+ j += WG_SIZE; + temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j]]; +#endif + } + } + else + { + for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) + temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j]]; + } + + partialSums[lid] = temp_sum; + + // Reduce partial sums + for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + __syncthreads(); + temp_sum = sum2_reduce(temp_sum, partialSums, lid, WG_SIZE, i); + } + + if(lid == 0U) + { + atomic_add(&y[row], temp_sum); + } + } +} + #endif // CSRMV_DEVICE_H diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index f796eb5a..0fdeec22 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -2,15 +2,270 @@ * Copyright 2018 Advanced Micro Devices, Inc. * ************************************************************************ */ +#include "definitions.h" #include "rocsparse.h" #include "rocsparse_csrmv.hpp" +__attribute__((unused)) +static unsigned int flp2(unsigned int x) +{ + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + return x - (x >> 1); +} + +// Short rows in CSR-Adaptive are batched together into a single row block. +// If there are a relatively small number of these, then we choose to do +// a horizontal reduction (groups of threads all reduce the same row). +// If there are many threads (e.g. more threads than the maximum size +// of our workgroup) then we choose to have each thread serially reduce +// the row. +// This function calculates the number of threads that could team up +// to reduce these groups of rows. For instance, if you have a +// workgroup size of 256 and 4 rows, you could have 64 threads +// working on each row. If you have 5 rows, only 32 threads could +// reliably work on each row because our reduction assumes power-of-2. 
+static unsigned long long numThreadsForReduction(unsigned long long num_rows) +{ +#if defined(__INTEL_COMPILER) + return WG_SIZE >> (_bit_scan_reverse(num_rows - 1) + 1); +#elif(defined(__HIP_PLATFORM_NVCC__)) + return flp2(WG_SIZE / num_rows); +#elif(defined(__clang__) && __has_builtin(__builtin_clz)) || \ + !defined(__clang) && defined(__GNUG__) && \ + ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 30202) + return (WG_SIZE >> (8 * sizeof(int) - __builtin_clz(num_rows - 1))); +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) + unsigned long long bit_returned; + _BitScanReverse(&bit_returned, (num_rows - 1)); + return WG_SIZE >> (bit_returned + 1); +#else + return flp2(WG_SIZE / num_rows); +#endif +} + +static void ComputeRowBlocks(unsigned long long* rowBlocks, + size_t& rowBlockSize, + const int* rowDelimiters, + int nRows, + bool allocate_row_blocks = true) +{ + unsigned long long* rowBlocksBase; + int total_row_blocks = 1; // Start at one because of rowBlock[0] + + if(allocate_row_blocks) + { + rowBlocksBase = rowBlocks; + *rowBlocks = 0; + rowBlocks++; + } + unsigned long long sum = 0; + unsigned long long i, last_i = 0; + + // Check to ensure nRows can fit in 32 bits + if((unsigned long long)nRows > (unsigned long long)std::pow(2, ROW_BITS)) + { + fprintf(stderr, "nrow does not fit in 32 bits\n"); + exit(1); + } + + int consecutive_long_rows = 0; + for(i = 1; i <= (unsigned long long)nRows; i++) + { + int row_length = (rowDelimiters[i] - rowDelimiters[i - 1]); + sum += row_length; + + // The following section of code calculates whether you're moving between + // a series of "short" rows and a series of "long" rows. + // This is because the reduction in CSR-Adaptive likes things to be + // roughly the same length. Long rows can be reduced horizontally. + // Short rows can be reduced one-thread-per-row. Try not to mix them. 
+ if(row_length > 128) + consecutive_long_rows++; + else if(consecutive_long_rows > 0) + { + // If it turns out we WERE in a long-row region, cut if off now. + if(row_length < 32) // Now we're in a short-row region + consecutive_long_rows = -1; + else + consecutive_long_rows++; + } + + // If you just entered into a "long" row from a series of short rows, + // then we need to make sure we cut off those short rows. Put them in + // their own workgroup. + if(consecutive_long_rows == 1) + { + // Assuming there *was* a previous workgroup. If not, nothing to do here. + if(i - last_i > 1) + { + if(allocate_row_blocks) + { + *rowBlocks = ((i - 1) << (64 - ROW_BITS)); + // If this row fits into CSR-Stream, calculate how many rows + // can be used to do a parallel reduction. + // Fill in the low-order bits with the numThreadsForRed + if(((i - 1) - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + *(rowBlocks - 1) |= numThreadsForReduction((i - 1) - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i - 1; + sum = row_length; + } + } + else if(consecutive_long_rows == -1) + { + // We see the first short row after some long ones that + // didn't previously fill up a row block. + if(allocate_row_blocks) + { + *rowBlocks = ((i - 1) << (64 - ROW_BITS)); + if(((i - 1) - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + *(rowBlocks - 1) |= numThreadsForReduction((i - 1) - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i - 1; + sum = row_length; + consecutive_long_rows = 0; + } + + // Now, what's up with this row? What did it do? 
+ + // exactly one row results in non-zero elements to be greater than blockSize + // This is csr-vector case; bottom WGBITS == workgroup ID + if((i - last_i == 1) && sum > (unsigned long long)BLOCKSIZE) + { + int numWGReq = + static_cast(std::ceil((double)row_length / (BLOCK_MULTIPLIER * BLOCKSIZE))); + + // Check to ensure #workgroups can fit in WGBITS bits, if not + // then the last workgroup will do all the remaining work + numWGReq = (numWGReq < (int)std::pow(2, WG_BITS)) ? numWGReq : (int)std::pow(2, WG_BITS); + + if(allocate_row_blocks) + { + for(int w = 1; w < numWGReq; w++) + { + *rowBlocks = ((i - 1) << (64 - ROW_BITS)); + *rowBlocks |= static_cast(w); + rowBlocks++; + } + *rowBlocks = (i << (64 - ROW_BITS)); + rowBlocks++; + } + total_row_blocks += numWGReq; + last_i = i; + sum = 0; + consecutive_long_rows = 0; + } + // more than one row results in non-zero elements to be greater than blockSize + // This is csr-stream case; bottom WGBITS = number of parallel reduction threads + else if((i - last_i > 1) && sum > (unsigned long long)BLOCKSIZE) + { + i--; // This row won't fit, so back off one. + if(allocate_row_blocks) + { + *rowBlocks = (i << (64 - ROW_BITS)); + if((i - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + *(rowBlocks - 1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i; + sum = 0; + consecutive_long_rows = 0; + } + // This is csr-stream case; bottom WGBITS = number of parallel reduction threads + else if(sum == (unsigned long long)BLOCKSIZE) + { + if(allocate_row_blocks) + { + *rowBlocks = (i << (64 - ROW_BITS)); + if((i - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + *(rowBlocks - 1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i; + sum = 0; + consecutive_long_rows = 0; + } + } + + // If we didn't fill a row block with the last row, make sure we don't lose it. 
+ if(allocate_row_blocks && (*(rowBlocks - 1) >> (64 - ROW_BITS)) != static_cast(nRows)) + { + *rowBlocks = (static_cast(nRows) << (64 - ROW_BITS)); + if((nRows - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + *(rowBlocks - 1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; + + if(allocate_row_blocks) + { + size_t dist = std::distance(rowBlocksBase, rowBlocks); + assert((2 * dist) <= rowBlockSize); + // Update the size of rowBlocks to reflect the actual amount of memory used + // We're multiplying the size by two because the extended precision form of + // CSR-Adaptive requires more space for the final global reduction. + rowBlockSize = 2 * dist; + } + else + rowBlockSize = 2 * total_row_blocks; +} + /* * =========================================================================== * C wrapper * =========================================================================== */ +extern "C" rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_csrmv_info info) +{ + // row blocks size + info->size = 0; + + // Temporary arrays to hold device data + std::vector hptr(m + 1); + RETURN_IF_HIP_ERROR(hipMemcpy(hptr.data(), csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToHost)); + + // Determine row blocks array size + ComputeRowBlocks((unsigned long long*)NULL, info->size, hptr.data(), m, false); + + // Create row blocks structure + std::vector row_blocks(info->size, 0); + + ComputeRowBlocks(row_blocks.data(), + info->size, + hptr.data(), + m, + true); + +printf("Required buffer size: %lu kByte\n", info->size * sizeof(unsigned long long) >> 10); + + // Allocate memory on device to hold csrmv info + RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->row_blocks, sizeof(unsigned long long) * info->size)); + + // Copy row 
blocks information to device + RETURN_IF_HIP_ERROR(hipMemcpy(info->row_blocks, row_blocks.data(), sizeof(unsigned long long) * info->size, hipMemcpyHostToDevice)); + + return rocsparse_status_success; +} + extern "C" rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, rocsparse_operation trans, rocsparse_int m, @@ -23,10 +278,11 @@ extern "C" rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, const rocsparse_int* csr_col_ind, const float* x, const float* beta, - float* y) + float* y, + const rocsparse_csrmv_info info) { return rocsparse_csrmv_template( - handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); + handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); } extern "C" rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, @@ -41,8 +297,9 @@ extern "C" rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, const rocsparse_int* csr_col_ind, const double* x, const double* beta, - double* y) + double* y, + const rocsparse_csrmv_info info) { return rocsparse_csrmv_template( - handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); + handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); } diff --git a/library/src/level2/rocsparse_csrmv.hpp b/library/src/level2/rocsparse_csrmv.hpp index 063aade9..70a86e72 100644 --- a/library/src/level2/rocsparse_csrmv.hpp +++ b/library/src/level2/rocsparse_csrmv.hpp @@ -13,36 +13,75 @@ #include +#define BLOCKSIZE 1024 +#define BLOCK_MULTIPLIER 3 +#define ROWS_FOR_VECTOR 1 +#define WG_BITS 24 +#define ROW_BITS 32 +#define WG_SIZE 256 + template -__global__ void csrmvn_kernel_host_pointer(rocsparse_int m, - T alpha, - const rocsparse_int* __restrict__ csr_row_ptr, - const rocsparse_int* __restrict__ csr_col_ind, - const T* __restrict__ csr_val, - const T* __restrict__ x, - T beta, - T* __restrict__ y, - rocsparse_index_base idx_base) +__global__ void 
csrmvn_general_kernel_host_pointer(rocsparse_int m, + T alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + T beta, + T* __restrict__ y, + rocsparse_index_base idx_base) { csrmvn_general_device( m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y, idx_base); } template -__global__ void csrmvn_kernel_device_pointer(rocsparse_int m, - const T* alpha, - const rocsparse_int* __restrict__ csr_row_ptr, - const rocsparse_int* __restrict__ csr_col_ind, - const T* __restrict__ csr_val, - const T* __restrict__ x, - const T* beta, - T* __restrict__ y, - rocsparse_index_base idx_base) +__global__ void csrmvn_general_kernel_device_pointer(rocsparse_int m, + const T* alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + const T* beta, + T* __restrict__ y, + rocsparse_index_base idx_base) { csrmvn_general_device( m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y, idx_base); } +template +__launch_bounds__(WG_SIZE) +__global__ void csrmvn_adaptive_kernel_host_pointer(unsigned long long* __restrict__ row_blocks, + T alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + T beta, + T* __restrict__ y, + rocsparse_index_base idx_base) +{ + csrmvn_adaptive_device( + row_blocks, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y, idx_base); +} + +template +__launch_bounds__(WG_SIZE) +__global__ void csrmvn_adaptive_kernel_device_pointer(unsigned long long* __restrict__ row_blocks, + const T* alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + const T* beta, + T* __restrict__ y, + rocsparse_index_base idx_base) +{ + 
csrmvn_adaptive_device( + row_blocks, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y, idx_base); +} + template rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, rocsparse_operation trans, @@ -56,7 +95,8 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, const rocsparse_int* csr_col_ind, const T* x, const T* beta, - T* y) + T* y, + const rocsparse_csrmv_info info) { // Check for valid handle and matrix descriptor if(handle == nullptr) @@ -165,6 +205,33 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, return rocsparse_status_success; } + if(info == nullptr) + { + // If csrmv info is not available, call csrmv general + return rocsparse_csrmv_general_template(handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); + } + else + { + // If csrmv info is available, call csrmv adaptive + return rocsparse_csrmv_adaptive_template(handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); + } +} + +template +rocsparse_status rocsparse_csrmv_general_template(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const T* alpha, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* x, + const T* beta, + T* y) +{ // Stream hipStream_t stream = handle->stream; @@ -183,7 +250,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { if(nnz_per_row < 4) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -200,7 +267,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 8) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -217,7 +284,7 @@ rocsparse_status 
rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 16) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -234,7 +301,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 32) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -251,7 +318,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -271,7 +338,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { if(nnz_per_row < 4) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -288,7 +355,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 8) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -305,7 +372,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 16) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -322,7 +389,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 32) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -339,7 +406,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 64) { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + 
hipLaunchKernelGGL((csrmvn_general_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -356,7 +423,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else { - hipLaunchKernelGGL((csrmvn_kernel_device_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_device_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -388,7 +455,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { if(nnz_per_row < 4) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -405,7 +472,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 8) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -422,7 +489,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 16) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -439,7 +506,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 32) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -456,7 +523,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -476,7 +543,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, { if(nnz_per_row < 4) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -493,7 +560,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 8) { - 
hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -510,7 +577,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 16) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -527,7 +594,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 32) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -544,7 +611,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else if(nnz_per_row < 64) { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -561,7 +628,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, } else { - hipLaunchKernelGGL((csrmvn_kernel_host_pointer), + hipLaunchKernelGGL((csrmvn_general_kernel_host_pointer), csrmvn_blocks, csrmvn_threads, 0, @@ -592,4 +659,77 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, return rocsparse_status_success; } +template +rocsparse_status rocsparse_csrmv_adaptive_template(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int n, + rocsparse_int nnz, + const T* alpha, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* x, + const T* beta, + T* y, + const rocsparse_csrmv_info info) +{ + // Stream + hipStream_t stream = handle->stream; + + // Run different csrmv kernels + if(trans == rocsparse_operation_none) + { + dim3 csrmvn_blocks((info->size / 2) - 1); + dim3 csrmvn_threads(WG_SIZE); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + 
hipLaunchKernelGGL((csrmvn_adaptive_kernel_device_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + info->row_blocks, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + beta, + y, + descr->base); + } + else + { + if(*alpha == 0.0 && *beta == 1.0) + { + return rocsparse_status_success; + } + + hipLaunchKernelGGL((csrmvn_adaptive_kernel_host_pointer), + csrmvn_blocks, + csrmvn_threads, + 0, + stream, + info->row_blocks, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + *beta, + y, + descr->base); + } + } + else + { + // TODO + return rocsparse_status_not_implemented; + } + return rocsparse_status_success; +} + #endif // ROCSPARSE_CSRMV_HPP diff --git a/library/src/level2/rocsparse_csrmv_adaptive.cpp b/library/src/level2/rocsparse_csrmv_adaptive.cpp deleted file mode 100644 index dadc54e3..00000000 --- a/library/src/level2/rocsparse_csrmv_adaptive.cpp +++ /dev/null @@ -1,305 +0,0 @@ -/* ************************************************************************ - * Copyright 2018 Advanced Micro Devices, Inc. - * ************************************************************************ */ - -#include "definitions.h" -#include "rocsparse.h" -#include "rocsparse_csrmv_adaptive.hpp" - -__attribute__((unused)) -static unsigned int flp2(unsigned int x) -{ - x |= (x >> 1); - x |= (x >> 2); - x |= (x >> 4); - x |= (x >> 8); - x |= (x >> 16); - return x - (x >> 1); -} - -// Short rows in CSR-Adaptive are batched together into a single row block. -// If there are a relatively small number of these, then we choose to do -// a horizontal reduction (groups of threads all reduce the same row). -// If there are many threads (e.g. more threads than the maximum size -// of our workgroup) then we choose to have each thread serially reduce -// the row. -// This function calculates the number of threads that could team up -// to reduce these groups of rows. 
For instance, if you have a -// workgroup size of 256 and 4 rows, you could have 64 threads -// working on each row. If you have 5 rows, only 32 threads could -// reliably work on each row because our reduction assumes power-of-2. -static unsigned long long numThreadsForReduction(unsigned long long num_rows) -{ -#if defined(__INTEL_COMPILER) - return WG_SIZE >> (_bit_scan_reverse(num_rows - 1) + 1); -#elif(defined(__HIP_PLATFORM_NVCC__)) - return flp2(WG_SIZE / num_rows); -#elif(defined(__clang__) && __has_builtin(__builtin_clz)) || \ - !defined(__clang) && defined(__GNUG__) && \ - ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 30202) - return (WG_SIZE >> (8 * sizeof(int) - __builtin_clz(num_rows - 1))); -#elif defined(_MSC_VER) && (_MSC_VER >= 1400) - unsigned long long bit_returned; - _BitScanReverse(&bit_returned, (num_rows - 1)); - return WG_SIZE >> (bit_returned + 1); -#else - return flp2(WG_SIZE / num_rows); -#endif -} - -static void ComputeRowBlocks(unsigned long long* rowBlocks, - size_t& rowBlockSize, - const int* rowDelimiters, - int nRows, - bool allocate_row_blocks = true) -{ - unsigned long long* rowBlocksBase; - int total_row_blocks = 1; // Start at one because of rowBlock[0] - - if(allocate_row_blocks) - { - rowBlocksBase = rowBlocks; - *rowBlocks = 0; - rowBlocks++; - } - unsigned long long sum = 0; - unsigned long long i, last_i = 0; - - // Check to ensure nRows can fit in 32 bits - if((unsigned long long)nRows > (unsigned long long)std::pow(2, ROW_BITS)) - { - fprintf(stderr, "nrow does not fit in 32 bits\n"); - exit(1); - } - - int consecutive_long_rows = 0; - for(i = 1; i <= (unsigned long long)nRows; i++) - { - int row_length = (rowDelimiters[i] - rowDelimiters[i - 1]); - sum += row_length; - - // The following section of code calculates whether you're moving between - // a series of "short" rows and a series of "long" rows. - // This is because the reduction in CSR-Adaptive likes things to be - // roughly the same length. 
Long rows can be reduced horizontally. - // Short rows can be reduced one-thread-per-row. Try not to mix them. - if(row_length > 128) - consecutive_long_rows++; - else if(consecutive_long_rows > 0) - { - // If it turns out we WERE in a long-row region, cut if off now. - if(row_length < 32) // Now we're in a short-row region - consecutive_long_rows = -1; - else - consecutive_long_rows++; - } - - // If you just entered into a "long" row from a series of short rows, - // then we need to make sure we cut off those short rows. Put them in - // their own workgroup. - if(consecutive_long_rows == 1) - { - // Assuming there *was* a previous workgroup. If not, nothing to do here. - if(i - last_i > 1) - { - if(allocate_row_blocks) - { - *rowBlocks = ((i - 1) << (64 - ROW_BITS)); - // If this row fits into CSR-Stream, calculate how many rows - // can be used to do a parallel reduction. - // Fill in the low-order bits with the numThreadsForRed - if(((i - 1) - last_i) > (unsigned long long)ROWS_FOR_VECTOR) - *(rowBlocks - 1) |= numThreadsForReduction((i - 1) - last_i); - rowBlocks++; - } - total_row_blocks++; - last_i = i - 1; - sum = row_length; - } - } - else if(consecutive_long_rows == -1) - { - // We see the first short row after some long ones that - // didn't previously fill up a row block. - if(allocate_row_blocks) - { - *rowBlocks = ((i - 1) << (64 - ROW_BITS)); - if(((i - 1) - last_i) > (unsigned long long)ROWS_FOR_VECTOR) - *(rowBlocks - 1) |= numThreadsForReduction((i - 1) - last_i); - rowBlocks++; - } - total_row_blocks++; - last_i = i - 1; - sum = row_length; - consecutive_long_rows = 0; - } - - // Now, what's up with this row? What did it do? 
- - // exactly one row results in non-zero elements to be greater than blockSize - // This is csr-vector case; bottom WGBITS == workgroup ID - if((i - last_i == 1) && sum > (unsigned long long)BLOCKSIZE) - { - int numWGReq = - static_cast(std::ceil((double)row_length / (BLOCK_MULTIPLIER * BLOCKSIZE))); - - // Check to ensure #workgroups can fit in WGBITS bits, if not - // then the last workgroup will do all the remaining work - numWGReq = (numWGReq < (int)std::pow(2, WG_BITS)) ? numWGReq : (int)std::pow(2, WG_BITS); - - if(allocate_row_blocks) - { - for(int w = 1; w < numWGReq; w++) - { - *rowBlocks = ((i - 1) << (64 - ROW_BITS)); - *rowBlocks |= static_cast(w); - rowBlocks++; - } - *rowBlocks = (i << (64 - ROW_BITS)); - rowBlocks++; - } - total_row_blocks += numWGReq; - last_i = i; - sum = 0; - consecutive_long_rows = 0; - } - // more than one row results in non-zero elements to be greater than blockSize - // This is csr-stream case; bottom WGBITS = number of parallel reduction threads - else if((i - last_i > 1) && sum > (unsigned long long)BLOCKSIZE) - { - i--; // This row won't fit, so back off one. - if(allocate_row_blocks) - { - *rowBlocks = (i << (64 - ROW_BITS)); - if((i - last_i) > (unsigned long long)ROWS_FOR_VECTOR) - *(rowBlocks - 1) |= numThreadsForReduction(i - last_i); - rowBlocks++; - } - total_row_blocks++; - last_i = i; - sum = 0; - consecutive_long_rows = 0; - } - // This is csr-stream case; bottom WGBITS = number of parallel reduction threads - else if(sum == (unsigned long long)BLOCKSIZE) - { - if(allocate_row_blocks) - { - *rowBlocks = (i << (64 - ROW_BITS)); - if((i - last_i) > (unsigned long long)ROWS_FOR_VECTOR) - *(rowBlocks - 1) |= numThreadsForReduction(i - last_i); - rowBlocks++; - } - total_row_blocks++; - last_i = i; - sum = 0; - consecutive_long_rows = 0; - } - } - - // If we didn't fill a row block with the last row, make sure we don't lose it. 
- if(allocate_row_blocks && (*(rowBlocks - 1) >> (64 - ROW_BITS)) != static_cast(nRows)) - { - *rowBlocks = (static_cast(nRows) << (64 - ROW_BITS)); - if((nRows - last_i) > (unsigned long long)ROWS_FOR_VECTOR) - *(rowBlocks - 1) |= numThreadsForReduction(i - last_i); - rowBlocks++; - } - total_row_blocks++; - - if(allocate_row_blocks) - { - size_t dist = std::distance(rowBlocksBase, rowBlocks); - assert((2 * dist) <= rowBlockSize); - // Update the size of rowBlocks to reflect the actual amount of memory used - // We're multiplying the size by two because the extended precision form of - // CSR-Adaptive requires more space for the final global reduction. - rowBlockSize = 2 * dist; - } - else - rowBlockSize = 2 * total_row_blocks; -} - -/* - * =========================================================================== - * C wrapper - * =========================================================================== - */ - -extern "C" rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int n, - rocsparse_int nnz, - const rocsparse_mat_descr descr, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - rocsparse_csrmv_info info) -{ - // row blocks size - info->size = 0; - - // Temporary arrays to hold device data - std::vector hptr(m + 1); - RETURN_IF_HIP_ERROR(hipMemcpy(hptr.data(), csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToHost)); - - // Determine row blocks array size - ComputeRowBlocks((unsigned long long*)NULL, info->size, hptr.data(), m, false); - - // Create row blocks structure - std::vector row_blocks(info->size, 0); - - ComputeRowBlocks(row_blocks.data(), - info->size, - hptr.data(), - m, - true); - -printf("Required buffer size: %lu kByte\n", info->size * sizeof(unsigned long long) >> 10); - - // Allocate memory on device to hold csrmv info - RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->row_blocks, sizeof(unsigned long long) * info->size)); - - // 
Copy row blocks information to device - RETURN_IF_HIP_ERROR(hipMemcpy(info->row_blocks, row_blocks.data(), sizeof(unsigned long long) * info->size, hipMemcpyHostToDevice)); - - return rocsparse_status_success; -} - -extern "C" rocsparse_status rocsparse_scsrmv_adaptive(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int n, - rocsparse_int nnz, - const float* alpha, - const rocsparse_mat_descr descr, - const float* csr_val, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const float* x, - const float* beta, - float* y, - const rocsparse_csrmv_info info) -{ - return rocsparse_csrmv_adaptive_template( - handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); -} - -extern "C" rocsparse_status rocsparse_dcsrmv_adaptive(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int n, - rocsparse_int nnz, - const double* alpha, - const rocsparse_mat_descr descr, - const double* csr_val, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const double* x, - const double* beta, - double* y, - const rocsparse_csrmv_info info) -{ - return rocsparse_csrmv_adaptive_template( - handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); -} diff --git a/library/src/level2/rocsparse_csrmv_adaptive.hpp b/library/src/level2/rocsparse_csrmv_adaptive.hpp deleted file mode 100644 index a7f81da4..00000000 --- a/library/src/level2/rocsparse_csrmv_adaptive.hpp +++ /dev/null @@ -1,242 +0,0 @@ -/* ************************************************************************ - * Copyright 2018 Advanced Micro Devices, Inc. 
- * ************************************************************************ */ - -#pragma once -#ifndef ROCSPARSE_CSRMV_ADAPTIVE_HPP -#define ROCSPARSE_CSRMV_ADAPTIVE_HPP - -#include "rocsparse.h" -#include "handle.h" -#include "utility.h" -#include "csrmv_adaptive_device.h" - -#include - -#define BLOCKSIZE 1024 -#define BLOCK_MULTIPLIER 3 -#define ROWS_FOR_VECTOR 1 -#define WG_BITS 24 -#define ROW_BITS 32 -#define WG_SIZE 256 - -template -__launch_bounds__(WG_SIZE) -__global__ void csrmvn_adaptive_kernel_host_pointer(unsigned long long* __restrict__ row_blocks, - T alpha, - const rocsparse_int* __restrict__ csr_row_ptr, - const rocsparse_int* __restrict__ csr_col_ind, - const T* __restrict__ csr_val, - const T* __restrict__ x, - T beta, - T* __restrict__ y, - rocsparse_index_base idx_base) -{ - csrmvn_adaptive_device( - row_blocks, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y, idx_base); -} - -template -__launch_bounds__(WG_SIZE) -__global__ void csrmvn_adaptive_kernel_device_pointer(unsigned long long* __restrict__ row_blocks, - const T* alpha, - const rocsparse_int* __restrict__ csr_row_ptr, - const rocsparse_int* __restrict__ csr_col_ind, - const T* __restrict__ csr_val, - const T* __restrict__ x, - const T* beta, - T* __restrict__ y, - rocsparse_index_base idx_base) -{ - csrmvn_adaptive_device( - row_blocks, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y, idx_base); -} - -template -rocsparse_status rocsparse_csrmv_adaptive_template(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int n, - rocsparse_int nnz, - const T* alpha, - const rocsparse_mat_descr descr, - const T* csr_val, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const T* x, - const T* beta, - T* y, - const rocsparse_csrmv_info info) -{ - // Check for valid handle and matrix descriptor - if(handle == nullptr) - { - return rocsparse_status_invalid_handle; - } - else if(descr == nullptr) - { - return 
rocsparse_status_invalid_pointer; - } - else if(info == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Logging TODO bench logging - if(handle->pointer_mode == rocsparse_pointer_mode_host) - { - log_trace(handle, - replaceX("rocsparse_Xcsrmv_adaptive"), - trans, - m, - n, - nnz, - *alpha, - (const void*&)descr, - (const void*&)csr_val, - (const void*&)csr_row_ptr, - (const void*&)csr_col_ind, - (const void*&)x, - *beta, - (const void*&)y, - (const void*&)info); - } - else - { - log_trace(handle, - replaceX("rocsparse_Xcsrmv_adaptive"), - trans, - m, - n, - nnz, - (const void*&)alpha, - (const void*&)descr, - (const void*&)csr_val, - (const void*&)csr_row_ptr, - (const void*&)csr_col_ind, - (const void*&)x, - (const void*&)beta, - (const void*&)y, - (const void*&)info); - } - - // Check index base - if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) - { - return rocsparse_status_invalid_value; - } - if(descr->type != rocsparse_matrix_type_general) - { - // TODO - return rocsparse_status_not_implemented; - } - - // Check sizes - if(m < 0) - { - return rocsparse_status_invalid_size; - } - else if(n < 0) - { - return rocsparse_status_invalid_size; - } - else if(nnz < 0) - { - return rocsparse_status_invalid_size; - } - - // Check pointer arguments - if(csr_val == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(csr_row_ptr == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(csr_col_ind == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(x == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(y == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(alpha == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(beta == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Quick return if possible - if(m == 0 || n == 0 || nnz == 0) - { - return rocsparse_status_success; - } - - // Stream - 
hipStream_t stream = handle->stream; - - // Run different csrmv kernels - if(trans == rocsparse_operation_none) - { - dim3 csrmvn_adaptive_blocks((info->size / 2) - 1); - dim3 csrmvn_adaptive_threads(WG_SIZE); - - if(handle->pointer_mode == rocsparse_pointer_mode_device) - { - hipLaunchKernelGGL((csrmvn_adaptive_kernel_device_pointer), - csrmvn_adaptive_blocks, - csrmvn_adaptive_threads, - 0, - stream, - info->row_blocks, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - beta, - y, - descr->base); - } - else - { - if(*alpha == 0.0 && *beta == 1.0) - { - return rocsparse_status_success; - } - - hipLaunchKernelGGL((csrmvn_adaptive_kernel_host_pointer), - csrmvn_adaptive_blocks, - csrmvn_adaptive_threads, - 0, - stream, - info->row_blocks, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - *beta, - y, - descr->base); - } - } - else - { - //TODO - return rocsparse_status_not_implemented; - } - - return rocsparse_status_success; -} - -#endif // ROCSPARSE_CSRMV_ADAPTIVE_HPP From 93de6508c4d8a2a543c33cc93a5c8f621cfe2239 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 26 Jul 2018 08:15:17 +0200 Subject: [PATCH 177/304] added ASSERT_NEAR check functionality to test utilities --- clients/common/unit.cpp | 41 +++++++++++++++++++++++++++++++++++++++- clients/include/unit.hpp | 3 +++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index d7533216..8fddb96c 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -23,7 +23,7 @@ * ==================================================== */ /*! 
\brief Template: gtest unit compare two matrices float/double/complex */ -// Do not put a wrapper over ASSERT_FLOAT_EQ, sincer assert exit the current function NOT the test +// Do not put a wrapper over ASSERT_FLOAT_EQ, since assert exit the current function NOT the test // case // a wrapper will cause the loop keep going @@ -90,3 +90,42 @@ void unit_check_general(rocsparse_int M, rocsparse_int N, size_t* hCPU, size_t* } } } + +/*! \brief Template: gtest unit compare two matrices float/double/complex */ +// Do not put a wrapper over ASSERT_FLOAT_EQ, since assert exit the current function NOT the test +// case +// a wrapper will cause the loop keep going + +template <> +void unit_check_near(rocsparse_int M, rocsparse_int N, float* hCPU, float* hGPU) +{ + for(rocsparse_int j = 0; j < N; j++) + { + for(rocsparse_int i = 0; i < M; i++) + { + float compare_val = std::max(std::abs(hCPU[i + j] * 1e-6f), 10 * FLT_EPSILON); +#ifdef GOOGLE_TEST + ASSERT_NEAR(hCPU[i + j], hGPU[i + j], compare_val); +#else + assert(std::abs(hCPU[i + j] - hGPU[i + j]) < compare_val); +#endif + } + } +} + +template <> +void unit_check_near(rocsparse_int M, rocsparse_int N, double* hCPU, double* hGPU) +{ + for(rocsparse_int j = 0; j < N; j++) + { + for(rocsparse_int i = 0; i < M; i++) + { + double compare_val = std::max(std::abs(hCPU[i + j] * 1e-14), 10 * DBL_EPSILON); +#ifdef GOOGLE_TEST + ASSERT_NEAR(hCPU[i + j], hGPU[i + j], compare_val); +#else + assert(std::abs(hCPU[i + j] - hGPU[i + j]) < compare_val); +#endif + } + } +} diff --git a/clients/include/unit.hpp b/clients/include/unit.hpp index da76ba6f..d2cad518 100644 --- a/clients/include/unit.hpp +++ b/clients/include/unit.hpp @@ -29,4 +29,7 @@ template void unit_check_general(rocsparse_int M, rocsparse_int N, T* hCPU, T* hGPU); +template +void unit_check_near(rocsparse_int M, rocsparse_int N, T* hCPU, T* hGPU); + #endif // UNIT_HPP From e6840e1d5c3fb5e0133033d1f60c14f30352e39a Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 26 Jul 
2018 08:16:24 +0200 Subject: [PATCH 178/304] added boolean value to Arguments testing class --- clients/include/utility.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index a3739cf0..48bf61fb 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -636,6 +636,7 @@ class Arguments rocsparse_int temp = 0; std::string filename = ""; + bool bswitch = false; Arguments& operator=(const Arguments& rhs) { @@ -662,6 +663,7 @@ class Arguments temp = rhs.temp; filename = rhs.filename; + bswitch = rhs.bswitch; return *this; } From e0f3c5016907d690a2b31f625c9959a4a350a698 Mon Sep 17 00:00:00 2001 From: Nico <31079890+ntrost57@users.noreply.github.com> Date: Fri, 27 Jul 2018 08:29:39 +0200 Subject: [PATCH 179/304] forcing CXX=hcc / hipcc for rocPRIM dependency download --- cmake/Dependencies.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index bee698d7..1a9920eb 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -49,6 +49,7 @@ if(HIP_PLATFORM STREQUAL "hcc") GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git GIT_TAG master INSTALL_DIR ${ROCPRIM_ROOT} + CXX HCC CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= LOG_DOWNLOAD TRUE LOG_CONFIGURE TRUE @@ -68,6 +69,7 @@ elseif(HIP_PLATFORM STREQUAL "nvcc") GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git GIT_TAG master INSTALL_DIR ${ROCPRIM_ROOT} + CXX HIPCC CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= LOG_DOWNLOAD TRUE LOG_CONFIGURE TRUE From ea48d43be5d6e596100547f12630e728882764b4 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sun, 29 Jul 2018 11:54:05 +0200 Subject: [PATCH 180/304] forcing hcc/hipcc compiler for rocprim dependency download --- cmake/Dependencies.cmake | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cmake/Dependencies.cmake 
b/cmake/Dependencies.cmake index 1a9920eb..cb46a8e7 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -49,8 +49,7 @@ if(HIP_PLATFORM STREQUAL "hcc") GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git GIT_TAG master INSTALL_DIR ${ROCPRIM_ROOT} - CXX HCC - CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= + CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hcc LOG_DOWNLOAD TRUE LOG_CONFIGURE TRUE LOG_INSTALL TRUE @@ -69,8 +68,7 @@ elseif(HIP_PLATFORM STREQUAL "nvcc") GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git GIT_TAG master INSTALL_DIR ${ROCPRIM_ROOT} - CXX HIPCC - CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= + CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= -DCMAKE_CXX_COMPILER=${HIP_HIPCC_EXECUTABLE} LOG_DOWNLOAD TRUE LOG_CONFIGURE TRUE LOG_INSTALL TRUE From b8c85a51cb5ae6196f0be0114bcbbd198947cc28 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sun, 29 Jul 2018 12:15:03 +0200 Subject: [PATCH 181/304] csr2coo test with real matrices --- clients/include/testing_csr2coo.hpp | 91 +++++++++++++++++++++++------ clients/tests/test_csr2coo.cpp | 43 ++++++++++++-- 2 files changed, 111 insertions(+), 23 deletions(-) diff --git a/clients/include/testing_csr2coo.hpp b/clients/include/testing_csr2coo.hpp index 0d497bb6..b8d3ced8 100644 --- a/clients/include/testing_csr2coo.hpp +++ b/clients/include/testing_csr2coo.hpp @@ -73,8 +73,23 @@ rocsparse_status testing_csr2coo(Arguments argus) rocsparse_int n = argus.N; rocsparse_int safe_size = 100; rocsparse_index_base idx_base = argus.idx_base; + std::string binfile = ""; + std::string filename = ""; rocsparse_status status; + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && n == -99 && argus.timing == 0) + { + binfile = argus.filename; + 
m = n = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + double scale = 0.02; if(m > 1000 || n > 1000) { @@ -117,31 +132,55 @@ rocsparse_status testing_csr2coo(Arguments argus) return rocsparse_status_success; } - // For testing, assemble a COO matrix and convert it to CSR first (on host) - // Host structures - std::vector hcoo_row_ind(nnz); - std::vector hcoo_row_ind_gold(nnz); - std::vector hcoo_col_ind(nnz); - std::vector hcoo_val(nnz); + std::vector hcsr_row_ptr; + std::vector hcoo_row_ind; + std::vector hcol_ind; + std::vector hval(nnz); - // Sample initial COO matrix on CPU + // Initial data on CPU srand(12345ULL); - gen_matrix_coo(m, n, nnz, hcoo_row_ind_gold, hcoo_col_ind, hcoo_val, idx_base); - - // Convert COO to CSR - std::vector hcsr_row_ptr(m + 1); - - // csr2coo on host - for(rocsparse_int i = 0; i < nnz; ++i) + if(binfile != "") { - ++hcsr_row_ptr[hcoo_row_ind_gold[i] + 1 - idx_base]; + if(read_bin_matrix(binfile.c_str(), m, n, nnz, hcsr_row_ptr, hcol_ind, hval, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } } - - hcsr_row_ptr[0] = idx_base; - for(rocsparse_int i = 0; i < m; ++i) + else if(argus.laplacian) { - hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcol_ind, hval, idx_base); + nnz = hcsr_row_ptr[m]; + } + else + { + if(filename != "") + { + if(read_mtx_matrix( + filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base); + } + + // Convert COO to CSR + hcsr_row_ptr.resize(m + 1, 0); + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; + } + + hcsr_row_ptr[0] = idx_base; + for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr[i + 
1] += hcsr_row_ptr[i]; + } } // Allocate memory on the device @@ -170,9 +209,23 @@ rocsparse_status testing_csr2coo(Arguments argus) rocsparse_csr2coo(handle, dcsr_row_ptr, nnz, m, dcoo_row_ind, idx_base)); // Copy output from device to host + hcoo_row_ind.resize(nnz); CHECK_HIP_ERROR(hipMemcpy( hcoo_row_ind.data(), dcoo_row_ind, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToHost)); + // CPU conversion to COO + std::vector hcoo_row_ind_gold(nnz); + for(rocsparse_int i = 0; i < m; ++i) + { + rocsparse_int row_begin = hcsr_row_ptr[i] - idx_base; + rocsparse_int row_end = hcsr_row_ptr[i + 1] - idx_base; + + for(rocsparse_int j = row_begin; j < row_end; ++j) + { + hcoo_row_ind_gold[j] = i + idx_base; + } + } + // Unit check unit_check_general(1, nnz, hcoo_row_ind_gold.data(), hcoo_row_ind.data()); } diff --git a/clients/tests/test_csr2coo.cpp b/clients/tests/test_csr2coo.cpp index 04b4d940..b2a36731 100644 --- a/clients/tests/test_csr2coo.cpp +++ b/clients/tests/test_csr2coo.cpp @@ -9,14 +9,29 @@ #include #include -typedef std::tuple csr2coo_tuple; +typedef std::tuple csr2coo_tuple; -int csr2coo_M_range[] = {-1, 0, 10, 500, 872, 1000}; -int csr2coo_N_range[] = {-3, 0, 33, 242, 623, 1000}; +int csr2coo_M_range[] = {-99, -1, 0, 10, 500, 872, 1000}; +int csr2coo_N_range[] = {-99, 0, 33, 242, 623, 1000}; rocsparse_index_base csr2coo_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +std::string csr2coo_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "ASIC_320k.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + class parameterized_csr2coo : public testing::TestWithParam { protected: @@ -33,6 +48,25 @@ Arguments setup_csr2coo_arguments(csr2coo_tuple tup) arg.N = std::get<1>(tup); arg.idx_base = std::get<2>(tup); arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<3>(tup); + + // 
Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + return arg; } @@ -50,4 +84,5 @@ INSTANTIATE_TEST_CASE_P(csr2coo, parameterized_csr2coo, testing::Combine(testing::ValuesIn(csr2coo_M_range), testing::ValuesIn(csr2coo_N_range), - testing::ValuesIn(csr2coo_idx_base_range))); + testing::ValuesIn(csr2coo_idx_base_range), + testing::ValuesIn(csr2coo_bin))); From 9abd5d615e8ca9ae73d8ce7161eabaac58f472fd Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sun, 29 Jul 2018 12:21:01 +0200 Subject: [PATCH 182/304] csrsort test with real matrices --- clients/include/testing_csrsort.hpp | 41 +++++++++++++++++++-------- clients/tests/test_csrsort.cpp | 43 ++++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/clients/include/testing_csrsort.hpp b/clients/include/testing_csrsort.hpp index 0df31e53..b9d2cab5 100644 --- a/clients/include/testing_csrsort.hpp +++ b/clients/include/testing_csrsort.hpp @@ -146,8 +146,23 @@ rocsparse_status testing_csrsort(Arguments argus) rocsparse_int safe_size = 100; rocsparse_int permute = argus.temp; rocsparse_index_base idx_base = argus.idx_base; + std::string binfile = ""; + std::string filename = ""; rocsparse_status status; + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && n == -99 && argus.timing == 0) + { + binfile = argus.filename; + m = n = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + size_t buffer_size = 0; double scale = 0.02; @@ -231,25 +246,29 @@ rocsparse_status testing_csrsort(Arguments argus) // Sample initial COO matrix on CPU srand(12345ULL); - if(argus.laplacian) + if(binfile 
!= "") + { + if(read_bin_matrix( + binfile.c_str(), m, n, nnz, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + } + else if(argus.laplacian) { m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base); nnz = hcsr_row_ptr[m]; } else { - if(argus.filename != "") + if(filename != "") { - if(read_mtx_matrix(argus.filename.c_str(), - m, - n, - nnz, - hcoo_row_ind, - hcsr_col_ind, - hcsr_val, - idx_base) != 0) + if(read_mtx_matrix( + filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base) != + 0) { - fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; } } diff --git a/clients/tests/test_csrsort.cpp b/clients/tests/test_csrsort.cpp index 083703ab..21f75450 100644 --- a/clients/tests/test_csrsort.cpp +++ b/clients/tests/test_csrsort.cpp @@ -9,13 +9,28 @@ #include #include -typedef std::tuple csrsort_tuple; +typedef std::tuple csrsort_tuple; -int csrsort_M_range[] = {-1, 0, 10, 500, 872, 1000}; -int csrsort_N_range[] = {-3, 0, 33, 242, 623, 1000}; +int csrsort_M_range[] = {-99, -1, 0, 10, 500, 872, 1000}; +int csrsort_N_range[] = {-99, 0, 33, 242, 623, 1000}; int csrsort_perm[] = {0, 1}; rocsparse_index_base csrsort_base[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +std::string csrsort_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "ASIC_320k.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + class parameterized_csrsort : public testing::TestWithParam { protected: @@ -33,6 +48,25 @@ Arguments setup_csrsort_arguments(csrsort_tuple tup) arg.temp = std::get<2>(tup); arg.idx_base = std::get<3>(tup); arg.timing = 0; + + // Determine absolute path of 
test matrix + std::string bin_file = std::get<4>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + return arg; } @@ -51,4 +85,5 @@ INSTANTIATE_TEST_CASE_P(csrsort, testing::Combine(testing::ValuesIn(csrsort_M_range), testing::ValuesIn(csrsort_N_range), testing::ValuesIn(csrsort_perm), - testing::ValuesIn(csrsort_base))); + testing::ValuesIn(csrsort_base), + testing::ValuesIn(csrsort_bin))); From 1754055275ebcca6e2e77576fca3c86c5816f1dc Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Sun, 29 Jul 2018 12:28:08 +0200 Subject: [PATCH 183/304] csr2csc test with real matrices --- clients/include/testing_csr2csc.hpp | 41 +++++++++++++++++++-------- clients/tests/test_csr2csc.cpp | 43 ++++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/clients/include/testing_csr2csc.hpp b/clients/include/testing_csr2csc.hpp index 068affb4..13b5fe09 100644 --- a/clients/include/testing_csr2csc.hpp +++ b/clients/include/testing_csr2csc.hpp @@ -274,8 +274,23 @@ rocsparse_status testing_csr2csc(Arguments argus) rocsparse_int safe_size = 100; rocsparse_index_base idx_base = argus.idx_base; rocsparse_action action = argus.action; + std::string binfile = ""; + std::string filename = ""; rocsparse_status status; + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && n == -99 && argus.timing == 0) + { + binfile = argus.filename; + m = n = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + size_t size = 0; double scale = 0.02; @@ -372,7 +387,16 @@ rocsparse_status testing_csr2csc(Arguments argus) // Sample initial COO matrix on CPU 
srand(12345ULL); - if(argus.laplacian) + if(binfile != "") + { + if(read_bin_matrix( + binfile.c_str(), m, n, nnz, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + } + else if(argus.laplacian) { m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base); nnz = hcsr_row_ptr[m]; @@ -381,18 +405,13 @@ rocsparse_status testing_csr2csc(Arguments argus) { std::vector hcoo_row_ind; - if(argus.filename != "") + if(filename != "") { - if(read_mtx_matrix(argus.filename.c_str(), - m, - n, - nnz, - hcoo_row_ind, - hcsr_col_ind, - hcsr_val, - idx_base) != 0) + if(read_mtx_matrix( + filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base) != + 0) { - fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; } } diff --git a/clients/tests/test_csr2csc.cpp b/clients/tests/test_csr2csc.cpp index 121d3fe6..543a0502 100644 --- a/clients/tests/test_csr2csc.cpp +++ b/clients/tests/test_csr2csc.cpp @@ -9,16 +9,31 @@ #include #include -typedef std::tuple csr2csc_tuple; +typedef std::tuple csr2csc_tuple; -int csr2csc_M_range[] = {-1, 0, 10, 500, 872, 1000}; -int csr2csc_N_range[] = {-3, 0, 33, 242, 623, 1000}; +int csr2csc_M_range[] = {-99, -1, 0, 10, 500, 872, 1000}; +int csr2csc_N_range[] = {-99, 0, 33, 242, 623, 1000}; rocsparse_action csr2csc_action_range[] = {rocsparse_action_numeric, rocsparse_action_symbolic}; rocsparse_index_base csr2csc_csr_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +std::string csr2csc_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "ASIC_320k.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + class parameterized_csr2csc : public 
testing::TestWithParam { protected: @@ -36,6 +51,25 @@ Arguments setup_csr2csc_arguments(csr2csc_tuple tup) arg.action = std::get<2>(tup); arg.idx_base = std::get<3>(tup); arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<4>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + return arg; } @@ -62,4 +96,5 @@ INSTANTIATE_TEST_CASE_P(csr2csc, testing::Combine(testing::ValuesIn(csr2csc_M_range), testing::ValuesIn(csr2csc_N_range), testing::ValuesIn(csr2csc_action_range), - testing::ValuesIn(csr2csc_csr_base_range))); + testing::ValuesIn(csr2csc_csr_base_range), + testing::ValuesIn(csr2csc_bin))); From c31f1110f5f178f20462d4874a84be219cc96e89 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 30 Jul 2018 14:13:55 +0200 Subject: [PATCH 184/304] gpu mem leak fix in hyb descriptor --- library/src/rocsparse_auxiliary.cpp | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index 27f90b15..907b0177 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -2,6 +2,7 @@ * Copyright 2018 Advanced Micro Devices, Inc. 
* ************************************************************************ */ +#include "definitions.h" #include "handle.h" #include "rocsparse.h" #include "utility.h" @@ -283,6 +284,30 @@ rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb) // Destruct try { + // Clean up ELL part + if(hyb->ell_col_ind != nullptr) + { + RETURN_IF_HIP_ERROR(hipFree(hyb->ell_col_ind)); + } + if(hyb->ell_val != nullptr) + { + RETURN_IF_HIP_ERROR(hipFree(hyb->ell_val)); + } + + // Clean up COO part + if(hyb->coo_row_ind != nullptr) + { + RETURN_IF_HIP_ERROR(hipFree(hyb->coo_row_ind)); + } + if(hyb->coo_col_ind != nullptr) + { + RETURN_IF_HIP_ERROR(hipFree(hyb->coo_col_ind)); + } + if(hyb->coo_val != nullptr) + { + RETURN_IF_HIP_ERROR(hipFree(hyb->coo_val)); + } + delete hyb; } catch(const rocsparse_status& status) From 1962f4d320a15a95f06c601ec3d86a029f6e6654 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 30 Jul 2018 14:15:03 +0200 Subject: [PATCH 185/304] fix for utility function that did not close a file after reading --- clients/include/utility.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index a3739cf0..bdd664ef 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -554,6 +554,8 @@ rocsparse_int read_bin_matrix(const char* filename, err |= fread(col.data(), sizeof(int), nnz, f); err |= fread(tmp.data(), sizeof(double), nnz, f); + fclose(f); + for(rocsparse_int i = 0; i < nnz; ++i) { val[i] = static_cast(tmp[i]); From 8ffe44a5e24b0479db943228e98ed69aac8fea59 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 30 Jul 2018 14:15:21 +0200 Subject: [PATCH 186/304] updated all tests to use real matrix data; removed duplicated tests --- clients/include/testing_coo2csr.hpp | 53 +++++++++++++++---- clients/include/testing_coosort.hpp | 55 +++++++++++++++----- clients/include/testing_csr2coo.hpp | 1 + clients/include/testing_csr2csc.hpp | 1 + clients/include/testing_csr2ell.hpp | 
42 +++++++++++---- clients/include/testing_csr2hyb.hpp | 42 +++++++++++---- clients/include/testing_csrsort.hpp | 1 + clients/tests/test_coo2csr.cpp | 68 ++++++++++++++++++++++++ clients/tests/test_coosort.cpp | 72 ++++++++++++++++++++++++++ clients/tests/test_csr2coo.cpp | 43 ++++++++++++++-- clients/tests/test_csr2csc.cpp | 51 ++++++++++++++++-- clients/tests/test_csr2ell.cpp | 77 +++++++++++++++++++++++++++ clients/tests/test_csr2hyb.cpp | 80 +++++++++++++++++++++++++++++ clients/tests/test_csrmv.cpp | 52 +++++++++++++++++-- clients/tests/test_csrsort.cpp | 43 ++++++++++++++-- 15 files changed, 619 insertions(+), 62 deletions(-) diff --git a/clients/include/testing_coo2csr.hpp b/clients/include/testing_coo2csr.hpp index bc6e531d..3dd24c98 100644 --- a/clients/include/testing_coo2csr.hpp +++ b/clients/include/testing_coo2csr.hpp @@ -13,6 +13,7 @@ #include #include +#include using namespace rocsparse; using namespace rocsparse_test; @@ -73,8 +74,23 @@ rocsparse_status testing_coo2csr(Arguments argus) rocsparse_int n = argus.N; rocsparse_int safe_size = 100; rocsparse_index_base idx_base = argus.idx_base; + std::string binfile = ""; + std::string filename = ""; rocsparse_status status; + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && n == -99 && argus.timing == 0) + { + binfile = argus.filename; + m = n = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + double scale = 0.02; if(m > 1000 || n > 1000) { @@ -124,7 +140,27 @@ rocsparse_status testing_coo2csr(Arguments argus) // Sample initial COO matrix on CPU srand(12345ULL); - if(argus.laplacian) + if(binfile != "") + { + std::vector hptr(m + 1); + if(read_bin_matrix(binfile.c_str(), m, n, nnz, hptr, hcoo_col_ind, hcoo_val, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + + hcoo_row_ind.resize(nnz); + + // Convert to COO + 
for(rocsparse_int i = 0; i < m; ++i) + { + for(rocsparse_int j = hptr[i]; j < hptr[i + 1]; ++j) + { + hcoo_row_ind[j - idx_base] = i + idx_base; + } + } + } + else if(argus.laplacian) { std::vector hptr(m + 1); m = n = gen_2d_laplacian(argus.laplacian, hptr, hcoo_col_ind, hcoo_val, idx_base); @@ -142,18 +178,13 @@ rocsparse_status testing_coo2csr(Arguments argus) } else { - if(argus.filename != "") + if(filename != "") { - if(read_mtx_matrix(argus.filename.c_str(), - m, - n, - nnz, - hcoo_row_ind, - hcoo_col_ind, - hcoo_val, - idx_base) != 0) + if(read_mtx_matrix( + filename.c_str(), m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val, idx_base) != + 0) { - fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; } } diff --git a/clients/include/testing_coosort.hpp b/clients/include/testing_coosort.hpp index 6f6f5004..eca9c7d9 100644 --- a/clients/include/testing_coosort.hpp +++ b/clients/include/testing_coosort.hpp @@ -13,6 +13,7 @@ #include #include +#include using namespace rocsparse; using namespace rocsparse_test; @@ -173,8 +174,23 @@ rocsparse_status testing_coosort(Arguments argus) rocsparse_int by_row = argus.trans == rocsparse_operation_none; rocsparse_int permute = argus.temp; rocsparse_index_base idx_base = argus.idx_base; + std::string binfile = ""; + std::string filename = ""; rocsparse_status status; + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && n == -99 && argus.timing == 0) + { + binfile = argus.filename; + m = n = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + size_t buffer_size = 0; double scale = 0.02; @@ -259,7 +275,27 @@ rocsparse_status testing_coosort(Arguments argus) // Sample initial COO matrix on CPU srand(12345ULL); - if(argus.laplacian) + if(binfile != "") + { + std::vector hcsr_row_ptr; + if(read_bin_matrix( + 
binfile.c_str(), m, n, nnz, hcsr_row_ptr, hcoo_col_ind, hcoo_val, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + + // Convert CSR to COO + hcoo_row_ind.resize(nnz); + for(rocsparse_int i = 0; i < m; ++i) + { + for(rocsparse_int j = hcsr_row_ptr[i]; j < hcsr_row_ptr[i + 1]; ++j) + { + hcoo_row_ind[j - idx_base] = i + idx_base; + } + } + } + else if(argus.laplacian) { std::vector hcsr_row_ptr; m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcoo_col_ind, hcoo_val, idx_base); @@ -271,24 +307,19 @@ rocsparse_status testing_coosort(Arguments argus) { for(rocsparse_int j = hcsr_row_ptr[i]; j < hcsr_row_ptr[i + 1]; ++j) { - hcoo_row_ind[j] = i + idx_base; + hcoo_row_ind[j - idx_base] = i + idx_base; } } } else { - if(argus.filename != "") + if(filename != "") { - if(read_mtx_matrix(argus.filename.c_str(), - m, - n, - nnz, - hcoo_row_ind, - hcoo_col_ind, - hcoo_val, - idx_base) != 0) + if(read_mtx_matrix( + filename.c_str(), m, n, nnz, hcoo_row_ind, hcoo_col_ind, hcoo_val, idx_base) != + 0) { - fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; } } diff --git a/clients/include/testing_csr2coo.hpp b/clients/include/testing_csr2coo.hpp index b8d3ced8..8758005b 100644 --- a/clients/include/testing_csr2coo.hpp +++ b/clients/include/testing_csr2coo.hpp @@ -13,6 +13,7 @@ #include #include +#include using namespace rocsparse; using namespace rocsparse_test; diff --git a/clients/include/testing_csr2csc.hpp b/clients/include/testing_csr2csc.hpp index 13b5fe09..250df296 100644 --- a/clients/include/testing_csr2csc.hpp +++ b/clients/include/testing_csr2csc.hpp @@ -13,6 +13,7 @@ #include #include +#include using namespace rocsparse; using namespace rocsparse_test; diff --git a/clients/include/testing_csr2ell.hpp b/clients/include/testing_csr2ell.hpp index eb524970..bfef7cd3 
100644 --- a/clients/include/testing_csr2ell.hpp +++ b/clients/include/testing_csr2ell.hpp @@ -13,6 +13,7 @@ #include #include +#include using namespace rocsparse; using namespace rocsparse_test; @@ -267,8 +268,23 @@ rocsparse_status testing_csr2ell(Arguments argus) rocsparse_int safe_size = 100; rocsparse_index_base csr_base = argus.idx_base; rocsparse_index_base ell_base = argus.idx_base2; + std::string binfile = ""; + std::string filename = ""; rocsparse_status status; + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && n == -99 && argus.timing == 0) + { + binfile = argus.filename; + m = n = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + double scale = 0.02; if(m > 1000 || n > 1000) { @@ -380,25 +396,29 @@ rocsparse_status testing_csr2ell(Arguments argus) // Sample initial COO matrix on CPU srand(12345ULL); - if(argus.laplacian) + if(binfile != "") + { + if(read_bin_matrix( + binfile.c_str(), m, n, nnz, hcsr_row_ptr, hcsr_col_ind, hcsr_val, csr_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + } + else if(argus.laplacian) { m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcsr_col_ind, hcsr_val, csr_base); nnz = hcsr_row_ptr[m]; } else { - if(argus.filename != "") + if(filename != "") { - if(read_mtx_matrix(argus.filename.c_str(), - m, - n, - nnz, - hcoo_row_ind, - hcsr_col_ind, - hcsr_val, - csr_base) != 0) + if(read_mtx_matrix( + filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, csr_base) != + 0) { - fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; } } diff --git a/clients/include/testing_csr2hyb.hpp b/clients/include/testing_csr2hyb.hpp index a717b5c0..57ce2043 100644 --- a/clients/include/testing_csr2hyb.hpp +++ 
b/clients/include/testing_csr2hyb.hpp @@ -13,6 +13,7 @@ #include #include +#include using namespace rocsparse; using namespace rocsparse_test; @@ -144,8 +145,23 @@ rocsparse_status testing_csr2hyb(Arguments argus) rocsparse_index_base idx_base = argus.idx_base; rocsparse_hyb_partition part = argus.part; rocsparse_int user_ell_width = argus.ell_width; + std::string binfile = ""; + std::string filename = ""; rocsparse_status status; + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && n == -99 && argus.timing == 0) + { + binfile = argus.filename; + m = n = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + double scale = 0.02; if(m > 1000 || n > 1000) { @@ -211,25 +227,29 @@ rocsparse_status testing_csr2hyb(Arguments argus) // Sample initial COO matrix on CPU srand(12345ULL); - if(argus.laplacian) + if(binfile != "") + { + if(read_bin_matrix( + binfile.c_str(), m, n, nnz, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + } + else if(argus.laplacian) { m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base); nnz = hcsr_row_ptr[m]; } else { - if(argus.filename != "") + if(filename != "") { - if(read_mtx_matrix(argus.filename.c_str(), - m, - n, - nnz, - hcoo_row_ind, - hcsr_col_ind, - hcsr_val, - idx_base) != 0) + if(read_mtx_matrix( + filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base) != + 0) { - fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; } } diff --git a/clients/include/testing_csrsort.hpp b/clients/include/testing_csrsort.hpp index b9d2cab5..32f37c5f 100644 --- a/clients/include/testing_csrsort.hpp +++ b/clients/include/testing_csrsort.hpp @@ -13,6 +13,7 @@ 
#include #include +#include using namespace rocsparse; using namespace rocsparse_test; diff --git a/clients/tests/test_coo2csr.cpp b/clients/tests/test_coo2csr.cpp index f1df1c47..acd8873a 100644 --- a/clients/tests/test_coo2csr.cpp +++ b/clients/tests/test_coo2csr.cpp @@ -8,8 +8,10 @@ #include #include #include +#include typedef std::tuple coo2csr_tuple; +typedef std::tuple coo2csr_bin_tuple; int coo2csr_M_range[] = {-1, 0, 10, 500, 872, 1000}; int coo2csr_N_range[] = {-3, 0, 33, 242, 623, 1000}; @@ -17,6 +19,21 @@ int coo2csr_N_range[] = {-3, 0, 33, 242, 623, 1000}; rocsparse_index_base coo2csr_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +std::string coo2csr_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "ASIC_320k.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + class parameterized_coo2csr : public testing::TestWithParam { protected: @@ -26,6 +43,15 @@ class parameterized_coo2csr : public testing::TestWithParam virtual void TearDown() {} }; +class parameterized_coo2csr_bin : public testing::TestWithParam +{ + protected: + parameterized_coo2csr_bin() {} + virtual ~parameterized_coo2csr_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + Arguments setup_coo2csr_arguments(coo2csr_tuple tup) { Arguments arg; @@ -36,6 +62,35 @@ Arguments setup_coo2csr_arguments(coo2csr_tuple tup) return arg; } +Arguments setup_coo2csr_arguments(coo2csr_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = -99; + arg.idx_base = std::get<0>(tup); + arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<1>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are 
stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + + return arg; +} + TEST(coo2csr_bad_arg, coo2csr) { testing_coo2csr_bad_arg(); } TEST_P(parameterized_coo2csr, coo2csr) @@ -46,8 +101,21 @@ TEST_P(parameterized_coo2csr, coo2csr) EXPECT_EQ(status, rocsparse_status_success); } +TEST_P(parameterized_coo2csr_bin, coo2csr_bin) +{ + Arguments arg = setup_coo2csr_arguments(GetParam()); + + rocsparse_status status = testing_coo2csr(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + INSTANTIATE_TEST_CASE_P(coo2csr, parameterized_coo2csr, testing::Combine(testing::ValuesIn(coo2csr_M_range), testing::ValuesIn(coo2csr_N_range), testing::ValuesIn(coo2csr_idx_base_range))); + +INSTANTIATE_TEST_CASE_P(coo2csr_bin, + parameterized_coo2csr_bin, + testing::Combine(testing::ValuesIn(coo2csr_idx_base_range), + testing::ValuesIn(coo2csr_bin))); diff --git a/clients/tests/test_coosort.cpp b/clients/tests/test_coosort.cpp index 624c799b..3027b7d6 100644 --- a/clients/tests/test_coosort.cpp +++ b/clients/tests/test_coosort.cpp @@ -8,8 +8,10 @@ #include #include #include +#include typedef std::tuple coosort_tuple; +typedef std::tuple coosort_bin_tuple; int coosort_M_range[] = {-1, 0, 10, 500, 3872, 10000}; int coosort_N_range[] = {-3, 0, 33, 242, 1623, 10000}; @@ -17,6 +19,21 @@ rocsparse_operation coosort_trans[] = {rocsparse_operation_none, rocsparse_opera int coosort_perm[] = {0, 1}; rocsparse_index_base coosort_base[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +std::string coosort_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "ASIC_320k.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + class parameterized_coosort : public testing::TestWithParam { protected: @@ -26,6 +43,15 @@ class parameterized_coosort : public testing::TestWithParam virtual void TearDown() {} }; +class 
parameterized_coosort_bin : public testing::TestWithParam +{ + protected: + parameterized_coosort_bin() {} + virtual ~parameterized_coosort_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + Arguments setup_coosort_arguments(coosort_tuple tup) { Arguments arg; @@ -38,6 +64,37 @@ Arguments setup_coosort_arguments(coosort_tuple tup) return arg; } +Arguments setup_coosort_arguments(coosort_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = -99; + arg.trans = std::get<0>(tup); + arg.temp = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<3>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + + return arg; +} + TEST(coosort_bad_arg, coosort) { testing_coosort_bad_arg(); } TEST_P(parameterized_coosort, coosort) @@ -48,6 +105,14 @@ TEST_P(parameterized_coosort, coosort) EXPECT_EQ(status, rocsparse_status_success); } +TEST_P(parameterized_coosort_bin, coosort_bin) +{ + Arguments arg = setup_coosort_arguments(GetParam()); + + rocsparse_status status = testing_coosort(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + INSTANTIATE_TEST_CASE_P(coosort, parameterized_coosort, testing::Combine(testing::ValuesIn(coosort_M_range), @@ -55,3 +120,10 @@ INSTANTIATE_TEST_CASE_P(coosort, testing::ValuesIn(coosort_trans), testing::ValuesIn(coosort_perm), testing::ValuesIn(coosort_base))); + +INSTANTIATE_TEST_CASE_P(coosort_bin, + parameterized_coosort_bin, + testing::Combine(testing::ValuesIn(coosort_trans), + testing::ValuesIn(coosort_perm), + testing::ValuesIn(coosort_base), + testing::ValuesIn(coosort_bin))); diff --git 
a/clients/tests/test_csr2coo.cpp b/clients/tests/test_csr2coo.cpp index b2a36731..ebfcc48c 100644 --- a/clients/tests/test_csr2coo.cpp +++ b/clients/tests/test_csr2coo.cpp @@ -8,11 +8,13 @@ #include #include #include +#include -typedef std::tuple csr2coo_tuple; +typedef std::tuple csr2coo_tuple; +typedef std::tuple csr2coo_bin_tuple; -int csr2coo_M_range[] = {-99, -1, 0, 10, 500, 872, 1000}; -int csr2coo_N_range[] = {-99, 0, 33, 242, 623, 1000}; +int csr2coo_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int csr2coo_N_range[] = {-3, 0, 33, 242, 623, 1000}; rocsparse_index_base csr2coo_idx_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; @@ -41,6 +43,15 @@ class parameterized_csr2coo : public testing::TestWithParam virtual void TearDown() {} }; +class parameterized_csr2coo_bin : public testing::TestWithParam +{ + protected: + parameterized_csr2coo_bin() {} + virtual ~parameterized_csr2coo_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + Arguments setup_csr2coo_arguments(csr2coo_tuple tup) { Arguments arg; @@ -48,9 +59,19 @@ Arguments setup_csr2coo_arguments(csr2coo_tuple tup) arg.N = std::get<1>(tup); arg.idx_base = std::get<2>(tup); arg.timing = 0; + return arg; +} + +Arguments setup_csr2coo_arguments(csr2coo_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = -99; + arg.idx_base = std::get<0>(tup); + arg.timing = 0; // Determine absolute path of test matrix - std::string bin_file = std::get<3>(tup); + std::string bin_file = std::get<1>(tup); // Get current executables absolute path char path_exe[PATH_MAX]; @@ -80,9 +101,21 @@ TEST_P(parameterized_csr2coo, csr2coo) EXPECT_EQ(status, rocsparse_status_success); } +TEST_P(parameterized_csr2coo_bin, csr2coo_bin) +{ + Arguments arg = setup_csr2coo_arguments(GetParam()); + + rocsparse_status status = testing_csr2coo(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + INSTANTIATE_TEST_CASE_P(csr2coo, parameterized_csr2coo, 
testing::Combine(testing::ValuesIn(csr2coo_M_range), testing::ValuesIn(csr2coo_N_range), - testing::ValuesIn(csr2coo_idx_base_range), + testing::ValuesIn(csr2coo_idx_base_range))); + +INSTANTIATE_TEST_CASE_P(csr2coo_bin, + parameterized_csr2coo_bin, + testing::Combine(testing::ValuesIn(csr2coo_idx_base_range), testing::ValuesIn(csr2coo_bin))); diff --git a/clients/tests/test_csr2csc.cpp b/clients/tests/test_csr2csc.cpp index 543a0502..88f5bb06 100644 --- a/clients/tests/test_csr2csc.cpp +++ b/clients/tests/test_csr2csc.cpp @@ -8,11 +8,13 @@ #include #include #include +#include -typedef std::tuple csr2csc_tuple; +typedef std::tuple csr2csc_tuple; +typedef std::tuple csr2csc_bin_tuple; -int csr2csc_M_range[] = {-99, -1, 0, 10, 500, 872, 1000}; -int csr2csc_N_range[] = {-99, 0, 33, 242, 623, 1000}; +int csr2csc_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int csr2csc_N_range[] = {-3, 0, 33, 242, 623, 1000}; rocsparse_action csr2csc_action_range[] = {rocsparse_action_numeric, rocsparse_action_symbolic}; @@ -43,6 +45,15 @@ class parameterized_csr2csc : public testing::TestWithParam virtual void TearDown() {} }; +class parameterized_csr2csc_bin : public testing::TestWithParam +{ + protected: + parameterized_csr2csc_bin() {} + virtual ~parameterized_csr2csc_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + Arguments setup_csr2csc_arguments(csr2csc_tuple tup) { Arguments arg; @@ -51,9 +62,20 @@ Arguments setup_csr2csc_arguments(csr2csc_tuple tup) arg.action = std::get<2>(tup); arg.idx_base = std::get<3>(tup); arg.timing = 0; + return arg; +} + +Arguments setup_csr2csc_arguments(csr2csc_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = -99; + arg.action = std::get<0>(tup); + arg.idx_base = std::get<1>(tup); + arg.timing = 0; // Determine absolute path of test matrix - std::string bin_file = std::get<4>(tup); + std::string bin_file = std::get<2>(tup); // Get current executables absolute path char path_exe[PATH_MAX]; @@ -91,10 +113,31 @@ 
TEST_P(parameterized_csr2csc, csr2csc_double) EXPECT_EQ(status, rocsparse_status_success); } +TEST_P(parameterized_csr2csc_bin, csr2csc_bin_float) +{ + Arguments arg = setup_csr2csc_arguments(GetParam()); + + rocsparse_status status = testing_csr2csc(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csr2csc_bin, csr2csc_bin_double) +{ + Arguments arg = setup_csr2csc_arguments(GetParam()); + + rocsparse_status status = testing_csr2csc(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + INSTANTIATE_TEST_CASE_P(csr2csc, parameterized_csr2csc, testing::Combine(testing::ValuesIn(csr2csc_M_range), testing::ValuesIn(csr2csc_N_range), testing::ValuesIn(csr2csc_action_range), + testing::ValuesIn(csr2csc_csr_base_range))); + +INSTANTIATE_TEST_CASE_P(csr2csc_bin, + parameterized_csr2csc_bin, + testing::Combine(testing::ValuesIn(csr2csc_action_range), testing::ValuesIn(csr2csc_csr_base_range), testing::ValuesIn(csr2csc_bin))); diff --git a/clients/tests/test_csr2ell.cpp b/clients/tests/test_csr2ell.cpp index 4ecc8231..cfd2b99d 100644 --- a/clients/tests/test_csr2ell.cpp +++ b/clients/tests/test_csr2ell.cpp @@ -8,8 +8,10 @@ #include #include #include +#include typedef std::tuple csr2ell_tuple; +typedef std::tuple csr2ell_bin_tuple; int csr2ell_M_range[] = {-1, 0, 10, 500, 872, 1000}; int csr2ell_N_range[] = {-3, 0, 33, 242, 623, 1000}; @@ -19,6 +21,20 @@ rocsparse_index_base csr2ell_csr_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base csr2ell_ell_base_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +std::string csr2ell_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + class parameterized_csr2ell : public testing::TestWithParam { protected: @@ -28,6 +44,15 @@ class parameterized_csr2ell : public testing::TestWithParam virtual void TearDown() {} 
}; +class parameterized_csr2ell_bin : public testing::TestWithParam +{ + protected: + parameterized_csr2ell_bin() {} + virtual ~parameterized_csr2ell_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + Arguments setup_csr2ell_arguments(csr2ell_tuple tup) { Arguments arg; @@ -39,6 +64,36 @@ Arguments setup_csr2ell_arguments(csr2ell_tuple tup) return arg; } +Arguments setup_csr2ell_arguments(csr2ell_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = -99; + arg.idx_base = std::get<0>(tup); + arg.idx_base2 = std::get<1>(tup); + arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<2>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + + return arg; +} + TEST(csr2ell_bad_arg, csr2ell) { testing_csr2ell_bad_arg(); } TEST_P(parameterized_csr2ell, csr2ell_float) @@ -57,9 +112,31 @@ TEST_P(parameterized_csr2ell, csr2ell_double) EXPECT_EQ(status, rocsparse_status_success); } +TEST_P(parameterized_csr2ell_bin, csr2ell_bin_float) +{ + Arguments arg = setup_csr2ell_arguments(GetParam()); + + rocsparse_status status = testing_csr2ell(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csr2ell_bin, csr2ell_bin_double) +{ + Arguments arg = setup_csr2ell_arguments(GetParam()); + + rocsparse_status status = testing_csr2ell(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + INSTANTIATE_TEST_CASE_P(csr2ell, parameterized_csr2ell, testing::Combine(testing::ValuesIn(csr2ell_M_range), testing::ValuesIn(csr2ell_N_range), testing::ValuesIn(csr2ell_csr_base_range), testing::ValuesIn(csr2ell_ell_base_range))); + +INSTANTIATE_TEST_CASE_P(csr2ell_bin, + 
parameterized_csr2ell_bin, + testing::Combine(testing::ValuesIn(csr2ell_csr_base_range), + testing::ValuesIn(csr2ell_ell_base_range), + testing::ValuesIn(csr2ell_bin))); diff --git a/clients/tests/test_csr2hyb.cpp b/clients/tests/test_csr2hyb.cpp index 2d4e653b..3b435c7a 100644 --- a/clients/tests/test_csr2hyb.cpp +++ b/clients/tests/test_csr2hyb.cpp @@ -8,8 +8,11 @@ #include #include #include +#include typedef std::tuple csr2hyb_tuple; +typedef std::tuple + csr2hyb_bin_tuple; int csr2hyb_M_range[] = {-1, 0, 10, 500, 872, 1000}; int csr2hyb_N_range[] = {-3, 0, 33, 242, 623, 1000}; @@ -22,6 +25,20 @@ rocsparse_hyb_partition csr2hyb_partition[] = { int csr2hyb_ELL_range[] = {-33, -1, 0, INT32_MAX}; +std::string csr2hyb_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + class parameterized_csr2hyb : public testing::TestWithParam { protected: @@ -31,6 +48,15 @@ class parameterized_csr2hyb : public testing::TestWithParam virtual void TearDown() {} }; +class parameterized_csr2hyb_bin : public testing::TestWithParam +{ + protected: + parameterized_csr2hyb_bin() {} + virtual ~parameterized_csr2hyb_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + Arguments setup_csr2hyb_arguments(csr2hyb_tuple tup) { Arguments arg; @@ -43,6 +69,37 @@ Arguments setup_csr2hyb_arguments(csr2hyb_tuple tup) return arg; } +Arguments setup_csr2hyb_arguments(csr2hyb_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = -99; + arg.idx_base = std::get<0>(tup); + arg.part = std::get<1>(tup); + arg.ell_width = std::get<2>(tup); + arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<3>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] 
= '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + + return arg; +} + TEST(csr2hyb_bad_arg, csr2hyb) { testing_csr2hyb_bad_arg(); } TEST_P(parameterized_csr2hyb, csr2hyb_float) @@ -61,6 +118,22 @@ TEST_P(parameterized_csr2hyb, csr2hyb_double) EXPECT_EQ(status, rocsparse_status_success); } +TEST_P(parameterized_csr2hyb_bin, csr2hyb_bin_float) +{ + Arguments arg = setup_csr2hyb_arguments(GetParam()); + + rocsparse_status status = testing_csr2hyb(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csr2hyb_bin, csr2hyb_bin_double) +{ + Arguments arg = setup_csr2hyb_arguments(GetParam()); + + rocsparse_status status = testing_csr2hyb(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + INSTANTIATE_TEST_CASE_P(csr2hyb, parameterized_csr2hyb, testing::Combine(testing::ValuesIn(csr2hyb_M_range), @@ -68,3 +141,10 @@ INSTANTIATE_TEST_CASE_P(csr2hyb, testing::ValuesIn(csr2hyb_idx_base_range), testing::ValuesIn(csr2hyb_partition), testing::ValuesIn(csr2hyb_ELL_range))); + +INSTANTIATE_TEST_CASE_P(csr2hyb_bin, + parameterized_csr2hyb_bin, + testing::Combine(testing::ValuesIn(csr2hyb_idx_base_range), + testing::ValuesIn(csr2hyb_partition), + testing::ValuesIn(csr2hyb_ELL_range), + testing::ValuesIn(csr2hyb_bin))); diff --git a/clients/tests/test_csrmv.cpp b/clients/tests/test_csrmv.cpp index f02dc39c..f21ecfd7 100644 --- a/clients/tests/test_csrmv.cpp +++ b/clients/tests/test_csrmv.cpp @@ -12,10 +12,11 @@ #include typedef rocsparse_index_base base; -typedef std::tuple csrmv_tuple; +typedef std::tuple csrmv_tuple; +typedef std::tuple csrmv_bin_tuple; -int csr_M_range[] = {-99, -1, 0, 500, 7111}; -int csr_N_range[] = {-99, 0, 842, 4441}; +int csr_M_range[] = {-1, 0, 500, 7111}; +int csr_N_range[] = {-3, 0, 842, 4441}; std::vector csr_alpha_range = {2.0, 3.0}; std::vector csr_beta_range = {0.0, 1.0}; @@ -46,6 
+47,15 @@ class parameterized_csrmv : public testing::TestWithParam virtual void TearDown() {} }; +class parameterized_csrmv_bin : public testing::TestWithParam +{ + protected: + parameterized_csrmv_bin() {} + virtual ~parameterized_csrmv_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + Arguments setup_csrmv_arguments(csrmv_tuple tup) { Arguments arg; @@ -55,9 +65,21 @@ Arguments setup_csrmv_arguments(csrmv_tuple tup) arg.beta = std::get<3>(tup); arg.idx_base = std::get<4>(tup); arg.timing = 0; + return arg; +} + +Arguments setup_csrmv_arguments(csrmv_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = -99; + arg.alpha = std::get<0>(tup); + arg.beta = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.timing = 0; // Determine absolute path of test matrix - std::string bin_file = std::get<5>(tup); + std::string bin_file = std::get<3>(tup); // Get current executables absolute path char path_exe[PATH_MAX]; @@ -95,11 +117,33 @@ TEST_P(parameterized_csrmv, csrmv_double) EXPECT_EQ(status, rocsparse_status_success); } +TEST_P(parameterized_csrmv_bin, csrmv_bin_float) +{ + Arguments arg = setup_csrmv_arguments(GetParam()); + + rocsparse_status status = testing_csrmv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csrmv_bin, csrmv_bin_double) +{ + Arguments arg = setup_csrmv_arguments(GetParam()); + + rocsparse_status status = testing_csrmv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + INSTANTIATE_TEST_CASE_P(csrmv, parameterized_csrmv, testing::Combine(testing::ValuesIn(csr_M_range), testing::ValuesIn(csr_N_range), testing::ValuesIn(csr_alpha_range), + testing::ValuesIn(csr_beta_range), + testing::ValuesIn(csr_idxbase_range))); + +INSTANTIATE_TEST_CASE_P(csrmv_bin, + parameterized_csrmv_bin, + testing::Combine(testing::ValuesIn(csr_alpha_range), testing::ValuesIn(csr_beta_range), testing::ValuesIn(csr_idxbase_range), testing::ValuesIn(csr_bin))); diff --git a/clients/tests/test_csrsort.cpp 
b/clients/tests/test_csrsort.cpp index 21f75450..4477e5b6 100644 --- a/clients/tests/test_csrsort.cpp +++ b/clients/tests/test_csrsort.cpp @@ -8,11 +8,13 @@ #include #include #include +#include -typedef std::tuple csrsort_tuple; +typedef std::tuple csrsort_tuple; +typedef std::tuple csrsort_bin_tuple; -int csrsort_M_range[] = {-99, -1, 0, 10, 500, 872, 1000}; -int csrsort_N_range[] = {-99, 0, 33, 242, 623, 1000}; +int csrsort_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int csrsort_N_range[] = {-3, 0, 33, 242, 623, 1000}; int csrsort_perm[] = {0, 1}; rocsparse_index_base csrsort_base[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; @@ -40,6 +42,15 @@ class parameterized_csrsort : public testing::TestWithParam virtual void TearDown() {} }; +class parameterized_csrsort_bin : public testing::TestWithParam +{ + protected: + parameterized_csrsort_bin() {} + virtual ~parameterized_csrsort_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + Arguments setup_csrsort_arguments(csrsort_tuple tup) { Arguments arg; @@ -48,9 +59,20 @@ Arguments setup_csrsort_arguments(csrsort_tuple tup) arg.temp = std::get<2>(tup); arg.idx_base = std::get<3>(tup); arg.timing = 0; + return arg; +} + +Arguments setup_csrsort_arguments(csrsort_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = -99; + arg.temp = std::get<0>(tup); + arg.idx_base = std::get<1>(tup); + arg.timing = 0; // Determine absolute path of test matrix - std::string bin_file = std::get<4>(tup); + std::string bin_file = std::get<2>(tup); // Get current executables absolute path char path_exe[PATH_MAX]; @@ -80,10 +102,23 @@ TEST_P(parameterized_csrsort, csrsort) EXPECT_EQ(status, rocsparse_status_success); } +TEST_P(parameterized_csrsort_bin, csrsort_bin) +{ + Arguments arg = setup_csrsort_arguments(GetParam()); + + rocsparse_status status = testing_csrsort(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + INSTANTIATE_TEST_CASE_P(csrsort, parameterized_csrsort, 
testing::Combine(testing::ValuesIn(csrsort_M_range), testing::ValuesIn(csrsort_N_range), testing::ValuesIn(csrsort_perm), + testing::ValuesIn(csrsort_base))); + +INSTANTIATE_TEST_CASE_P(csrsort_bin, + parameterized_csrsort_bin, + testing::Combine(testing::ValuesIn(csrsort_perm), testing::ValuesIn(csrsort_base), testing::ValuesIn(csrsort_bin))); From 4e5bb45e828532231839038693e5c3f908c35d25 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 30 Jul 2018 14:31:50 +0200 Subject: [PATCH 187/304] real matrix tests for ellmv --- clients/include/testing_ellmv.hpp | 57 ++++++++++++++++++---- clients/tests/test_ellmv.cpp | 79 +++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 8 deletions(-) diff --git a/clients/include/testing_ellmv.hpp b/clients/include/testing_ellmv.hpp index c71df4dc..1be25e9b 100644 --- a/clients/include/testing_ellmv.hpp +++ b/clients/include/testing_ellmv.hpp @@ -132,8 +132,23 @@ rocsparse_status testing_ellmv(Arguments argus) T h_beta = argus.beta; rocsparse_operation trans = argus.trans; rocsparse_index_base idx_base = argus.idx_base; + std::string binfile = ""; + std::string filename = ""; rocsparse_status status; + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && n == -99 && argus.timing == 0) + { + binfile = argus.filename; + m = n = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + std::unique_ptr test_handle(new handle_struct); rocsparse_handle handle = test_handle->handle; @@ -196,19 +211,27 @@ rocsparse_status testing_ellmv(Arguments argus) // Initial Data on CPU srand(12345ULL); - if(argus.laplacian) + if(binfile != "") + { + if(read_bin_matrix(binfile.c_str(), m, n, nnz, hcsr_row_ptr, hcol_ind, hval, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + } + else if(argus.laplacian) { m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, 
hcol_ind, hval, idx_base); nnz = hcsr_row_ptr[m]; } else { - if(argus.filename != "") + if(filename != "") { if(read_mtx_matrix( - argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base) != 0) + filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base) != 0) { - fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; } } @@ -336,11 +359,29 @@ rocsparse_status testing_ellmv(Arguments argus) for(rocsparse_int i = 0; i < m; ++i) { - hy_gold[i] *= h_beta; - for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; - ++j) + T sum = static_cast(0); + for(rocsparse_int p = 0; p < ell_width; ++p) + { + rocsparse_int idx = ELL_IND(i, p, m, ell_width); + rocsparse_int col = hell_col_ind[idx] - idx_base; + + if(col >= 0 && col < n) + { + sum += hell_val[idx] * hx[col]; + } + else + { + break; + } + } + + if(h_beta != static_cast(0)) + { + hy_gold[i] = h_beta * hy_gold[i] + h_alpha * sum; + } + else { - hy_gold[i] += h_alpha * hval[j] * hx[hcol_ind[j] - idx_base]; + hy_gold[i] = h_alpha * sum; } } diff --git a/clients/tests/test_ellmv.cpp b/clients/tests/test_ellmv.cpp index c5d8da4c..c287343d 100644 --- a/clients/tests/test_ellmv.cpp +++ b/clients/tests/test_ellmv.cpp @@ -8,9 +8,11 @@ #include #include #include +#include typedef rocsparse_index_base base; typedef std::tuple ellmv_tuple; +typedef std::tuple ellmv_bin_tuple; int ell_M_range[] = {-1, 0, 10, 500, 7111, 10000}; int ell_N_range[] = {-3, 0, 33, 842, 4441, 10000}; @@ -20,6 +22,20 @@ std::vector ell_beta_range = {0.0, 0.6}; base ell_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +std::string ell_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + class 
parameterized_ellmv : public testing::TestWithParam { protected: @@ -29,6 +45,15 @@ class parameterized_ellmv : public testing::TestWithParam virtual void TearDown() {} }; +class parameterized_ellmv_bin : public testing::TestWithParam +{ + protected: + parameterized_ellmv_bin() {} + virtual ~parameterized_ellmv_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + Arguments setup_ellmv_arguments(ellmv_tuple tup) { Arguments arg; @@ -41,6 +66,37 @@ Arguments setup_ellmv_arguments(ellmv_tuple tup) return arg; } +Arguments setup_ellmv_arguments(ellmv_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = -99; + arg.alpha = std::get<0>(tup); + arg.beta = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<3>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + + return arg; +} + TEST(ellmv_bad_arg, ellmv_float) { testing_ellmv_bad_arg(); } TEST_P(parameterized_ellmv, ellmv_float) @@ -59,6 +115,22 @@ TEST_P(parameterized_ellmv, ellmv_double) EXPECT_EQ(status, rocsparse_status_success); } +TEST_P(parameterized_ellmv_bin, ellmv_bin_float) +{ + Arguments arg = setup_ellmv_arguments(GetParam()); + + rocsparse_status status = testing_ellmv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_ellmv_bin, ellmv_bin_double) +{ + Arguments arg = setup_ellmv_arguments(GetParam()); + + rocsparse_status status = testing_ellmv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + INSTANTIATE_TEST_CASE_P(ellmv, parameterized_ellmv, testing::Combine(testing::ValuesIn(ell_M_range), @@ -66,3 +138,10 @@ 
INSTANTIATE_TEST_CASE_P(ellmv, testing::ValuesIn(ell_alpha_range), testing::ValuesIn(ell_beta_range), testing::ValuesIn(ell_idxbase_range))); + +INSTANTIATE_TEST_CASE_P(ellmv_bin, + parameterized_ellmv_bin, + testing::Combine(testing::ValuesIn(ell_alpha_range), + testing::ValuesIn(ell_beta_range), + testing::ValuesIn(ell_idxbase_range), + testing::ValuesIn(ell_bin))); From 90ec9c7ce547ba40a6be8ac71c467552126a3dde Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 30 Jul 2018 15:01:44 +0200 Subject: [PATCH 188/304] tests: trans -> transA --- clients/include/testing_coomv.hpp | 32 +++++++++++++++---------------- clients/include/testing_csrmv.hpp | 32 +++++++++++++++---------------- clients/include/testing_ellmv.hpp | 30 ++++++++++++++--------------- clients/include/testing_hybmv.hpp | 28 +++++++++++++-------------- 4 files changed, 61 insertions(+), 61 deletions(-) diff --git a/clients/include/testing_coomv.hpp b/clients/include/testing_coomv.hpp index 5c1b96f6..7bc6ecce 100644 --- a/clients/include/testing_coomv.hpp +++ b/clients/include/testing_coomv.hpp @@ -26,7 +26,7 @@ void testing_coomv_bad_arg(void) rocsparse_int safe_size = 100; T alpha = 0.6; T beta = 0.2; - rocsparse_operation trans = rocsparse_operation_none; + rocsparse_operation transA = rocsparse_operation_none; rocsparse_status status; std::unique_ptr unique_ptr_handle(new handle_struct); @@ -60,7 +60,7 @@ void testing_coomv_bad_arg(void) rocsparse_int* drow_null = nullptr; status = rocsparse_coomv( - handle, trans, m, n, nnz, &alpha, descr, dval, drow_null, dcol, dx, &beta, dy); + handle, transA, m, n, nnz, &alpha, descr, dval, drow_null, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: drow is nullptr"); } // testing for(nullptr == dcol) @@ -68,7 +68,7 @@ void testing_coomv_bad_arg(void) rocsparse_int* dcol_null = nullptr; status = rocsparse_coomv( - handle, trans, m, n, nnz, &alpha, descr, dval, drow, dcol_null, dx, &beta, dy); + handle, transA, m, n, nnz, &alpha, 
descr, dval, drow, dcol_null, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); } // testing for(nullptr == dval) @@ -76,7 +76,7 @@ void testing_coomv_bad_arg(void) T* dval_null = nullptr; status = rocsparse_coomv( - handle, trans, m, n, nnz, &alpha, descr, dval_null, drow, dcol, dx, &beta, dy); + handle, transA, m, n, nnz, &alpha, descr, dval_null, drow, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); } // testing for(nullptr == dx) @@ -84,7 +84,7 @@ void testing_coomv_bad_arg(void) T* dx_null = nullptr; status = rocsparse_coomv( - handle, trans, m, n, nnz, &alpha, descr, dval, drow, dcol, dx_null, &beta, dy); + handle, transA, m, n, nnz, &alpha, descr, dval, drow, dcol, dx_null, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); } // testing for(nullptr == dy) @@ -92,7 +92,7 @@ void testing_coomv_bad_arg(void) T* dy_null = nullptr; status = rocsparse_coomv( - handle, trans, m, n, nnz, &alpha, descr, dval, drow, dcol, dx, &beta, dy_null); + handle, transA, m, n, nnz, &alpha, descr, dval, drow, dcol, dx, &beta, dy_null); verify_rocsparse_status_invalid_pointer(status, "Error: dy is nullptr"); } // testing for(nullptr == d_alpha) @@ -100,7 +100,7 @@ void testing_coomv_bad_arg(void) T* d_alpha_null = nullptr; status = rocsparse_coomv( - handle, trans, m, n, nnz, d_alpha_null, descr, dval, drow, dcol, dx, &beta, dy); + handle, transA, m, n, nnz, d_alpha_null, descr, dval, drow, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } // testing for(nullptr == d_beta) @@ -108,7 +108,7 @@ void testing_coomv_bad_arg(void) T* d_beta_null = nullptr; status = rocsparse_coomv( - handle, trans, m, n, nnz, &alpha, descr, dval, drow, dcol, dx, d_beta_null, dy); + handle, transA, m, n, nnz, &alpha, descr, dval, drow, dcol, dx, d_beta_null, dy); verify_rocsparse_status_invalid_pointer(status, "Error: beta is 
nullptr"); } // testing for(nullptr == descr) @@ -116,7 +116,7 @@ void testing_coomv_bad_arg(void) rocsparse_mat_descr descr_null = nullptr; status = rocsparse_coomv( - handle, trans, m, n, nnz, &alpha, descr_null, dval, drow, dcol, dx, &beta, dy); + handle, transA, m, n, nnz, &alpha, descr_null, dval, drow, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } // testing for(nullptr == handle) @@ -124,7 +124,7 @@ void testing_coomv_bad_arg(void) rocsparse_handle handle_null = nullptr; status = rocsparse_coomv( - handle_null, trans, m, n, nnz, &alpha, descr, dval, drow, dcol, dx, &beta, dy); + handle_null, transA, m, n, nnz, &alpha, descr, dval, drow, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_handle(status); } } @@ -137,7 +137,7 @@ rocsparse_status testing_coomv(Arguments argus) rocsparse_int n = argus.N; T h_alpha = argus.alpha; T h_beta = argus.beta; - rocsparse_operation trans = argus.trans; + rocsparse_operation transA = argus.transA; rocsparse_index_base idx_base = argus.idx_base; rocsparse_status status; @@ -184,7 +184,7 @@ rocsparse_status testing_coomv(Arguments argus) CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); status = rocsparse_coomv( - handle, trans, m, n, nnz, &h_alpha, descr, dval, drow, dcol, dx, &h_beta, dy); + handle, transA, m, n, nnz, &h_alpha, descr, dval, drow, dcol, dx, &h_beta, dy); if(m < 0 || n < 0 || nnz < 0) { @@ -296,12 +296,12 @@ rocsparse_status testing_coomv(Arguments argus) // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); CHECK_ROCSPARSE_ERROR(rocsparse_coomv( - handle, trans, m, n, nnz, &h_alpha, descr, dval, drow, dcol, dx, &h_beta, dy_1)); + handle, transA, m, n, nnz, &h_alpha, descr, dval, drow, dcol, dx, &h_beta, dy_1)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); 
CHECK_ROCSPARSE_ERROR(rocsparse_coomv( - handle, trans, m, n, nnz, d_alpha, descr, dval, drow, dcol, dx, d_beta, dy_2)); + handle, transA, m, n, nnz, d_alpha, descr, dval, drow, dcol, dx, d_beta, dy_2)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); @@ -340,7 +340,7 @@ rocsparse_status testing_coomv(Arguments argus) for(int iter = 0; iter < number_cold_calls; iter++) { rocsparse_coomv( - handle, trans, m, n, nnz, &h_alpha, descr, dval, drow, dcol, dx, &h_beta, dy_1); + handle, transA, m, n, nnz, &h_alpha, descr, dval, drow, dcol, dx, &h_beta, dy_1); } double gpu_time_used = get_time_us(); // in microseconds @@ -348,7 +348,7 @@ rocsparse_status testing_coomv(Arguments argus) for(int iter = 0; iter < number_hot_calls; iter++) { rocsparse_coomv( - handle, trans, m, n, nnz, &h_alpha, descr, dval, drow, dcol, dx, &h_beta, dy_1); + handle, transA, m, n, nnz, &h_alpha, descr, dval, drow, dcol, dx, &h_beta, dy_1); } // Convert to miliseconds per call diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index c51e1f9d..ee9c4f04 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -26,7 +26,7 @@ void testing_csrmv_bad_arg(void) rocsparse_int safe_size = 100; T alpha = 0.6; T beta = 0.2; - rocsparse_operation trans = rocsparse_operation_none; + rocsparse_operation transA = rocsparse_operation_none; rocsparse_status status; std::unique_ptr unique_ptr_handle(new handle_struct); @@ -60,7 +60,7 @@ void testing_csrmv_bad_arg(void) rocsparse_int* dptr_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr, dval, dptr_null, dcol, dx, &beta, dy); + handle, transA, m, n, nnz, &alpha, descr, dval, dptr_null, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); } // testing for(nullptr == dcol) @@ -68,7 +68,7 @@ void testing_csrmv_bad_arg(void) rocsparse_int* dcol_null = 
nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol_null, dx, &beta, dy); + handle, transA, m, n, nnz, &alpha, descr, dval, dptr, dcol_null, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); } // testing for(nullptr == dval) @@ -76,7 +76,7 @@ void testing_csrmv_bad_arg(void) T* dval_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr, dval_null, dptr, dcol, dx, &beta, dy); + handle, transA, m, n, nnz, &alpha, descr, dval_null, dptr, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); } // testing for(nullptr == dx) @@ -84,7 +84,7 @@ void testing_csrmv_bad_arg(void) T* dx_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx_null, &beta, dy); + handle, transA, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx_null, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); } // testing for(nullptr == dy) @@ -92,7 +92,7 @@ void testing_csrmv_bad_arg(void) T* dy_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy_null); + handle, transA, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy_null); verify_rocsparse_status_invalid_pointer(status, "Error: dy is nullptr"); } // testing for(nullptr == d_alpha) @@ -100,7 +100,7 @@ void testing_csrmv_bad_arg(void) T* d_alpha_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, d_alpha_null, descr, dval, dptr, dcol, dx, &beta, dy); + handle, transA, m, n, nnz, d_alpha_null, descr, dval, dptr, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } // testing for(nullptr == d_beta) @@ -108,7 +108,7 @@ void testing_csrmv_bad_arg(void) T* d_beta_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, d_beta_null, dy); + 
handle, transA, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, d_beta_null, dy); verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); } // testing for(nullptr == descr) @@ -116,7 +116,7 @@ void testing_csrmv_bad_arg(void) rocsparse_mat_descr descr_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr_null, dval, dptr, dcol, dx, &beta, dy); + handle, transA, m, n, nnz, &alpha, descr_null, dval, dptr, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } // testing for(nullptr == handle) @@ -124,7 +124,7 @@ void testing_csrmv_bad_arg(void) rocsparse_handle handle_null = nullptr; status = rocsparse_csrmv( - handle_null, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy); + handle_null, transA, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy); verify_rocsparse_status_invalid_handle(status); } } @@ -137,7 +137,7 @@ rocsparse_status testing_csrmv(Arguments argus) rocsparse_int n = argus.N; T h_alpha = argus.alpha; T h_beta = argus.beta; - rocsparse_operation trans = argus.trans; + rocsparse_operation transA = argus.transA; rocsparse_index_base idx_base = argus.idx_base; rocsparse_status status; @@ -184,7 +184,7 @@ rocsparse_status testing_csrmv(Arguments argus) CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); status = rocsparse_csrmv( - handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy); + handle, transA, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy); if(m < 0 || n < 0 || nnz < 0) { @@ -303,12 +303,12 @@ rocsparse_status testing_csrmv(Arguments argus) // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); CHECK_ROCSPARSE_ERROR(rocsparse_csrmv( - handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1)); + handle, transA, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, 
dy_1)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); CHECK_ROCSPARSE_ERROR(rocsparse_csrmv( - handle, trans, m, n, nnz, d_alpha, descr, dval, dptr, dcol, dx, d_beta, dy_2)); + handle, transA, m, n, nnz, d_alpha, descr, dval, dptr, dcol, dx, d_beta, dy_2)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); @@ -347,7 +347,7 @@ rocsparse_status testing_csrmv(Arguments argus) for(int iter = 0; iter < number_cold_calls; iter++) { rocsparse_csrmv( - handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1); + handle, transA, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1); } double gpu_time_used = get_time_us(); // in microseconds @@ -355,7 +355,7 @@ rocsparse_status testing_csrmv(Arguments argus) for(int iter = 0; iter < number_hot_calls; iter++) { rocsparse_csrmv( - handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1); + handle, transA, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1); } // Convert to miliseconds per call diff --git a/clients/include/testing_ellmv.hpp b/clients/include/testing_ellmv.hpp index e09c375d..4cd34b85 100644 --- a/clients/include/testing_ellmv.hpp +++ b/clients/include/testing_ellmv.hpp @@ -30,7 +30,7 @@ void testing_ellmv_bad_arg(void) rocsparse_int ell_width = 8; T alpha = 0.6; T beta = 0.2; - rocsparse_operation trans = rocsparse_operation_none; + rocsparse_operation transA = rocsparse_operation_none; rocsparse_status status; std::unique_ptr unique_ptr_handle(new handle_struct); @@ -61,7 +61,7 @@ void testing_ellmv_bad_arg(void) rocsparse_int* dcol_null = nullptr; status = rocsparse_ellmv( - handle, trans, m, n, &alpha, descr, dval, dcol_null, ell_width, dx, &beta, dy); + handle, transA, m, n, &alpha, descr, dval, dcol_null, ell_width, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dcol is 
nullptr"); } // testing for(nullptr == dval) @@ -69,7 +69,7 @@ void testing_ellmv_bad_arg(void) T* dval_null = nullptr; status = rocsparse_ellmv( - handle, trans, m, n, &alpha, descr, dval_null, dcol, ell_width, dx, &beta, dy); + handle, transA, m, n, &alpha, descr, dval_null, dcol, ell_width, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); } // testing for(nullptr == dx) @@ -77,7 +77,7 @@ void testing_ellmv_bad_arg(void) T* dx_null = nullptr; status = rocsparse_ellmv( - handle, trans, m, n, &alpha, descr, dval, dcol, ell_width, dx_null, &beta, dy); + handle, transA, m, n, &alpha, descr, dval, dcol, ell_width, dx_null, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); } // testing for(nullptr == dy) @@ -85,7 +85,7 @@ void testing_ellmv_bad_arg(void) T* dy_null = nullptr; status = rocsparse_ellmv( - handle, trans, m, n, &alpha, descr, dval, dcol, ell_width, dx, &beta, dy_null); + handle, transA, m, n, &alpha, descr, dval, dcol, ell_width, dx, &beta, dy_null); verify_rocsparse_status_invalid_pointer(status, "Error: dy is nullptr"); } // testing for(nullptr == d_alpha) @@ -93,7 +93,7 @@ void testing_ellmv_bad_arg(void) T* d_alpha_null = nullptr; status = rocsparse_ellmv( - handle, trans, m, n, d_alpha_null, descr, dval, dcol, ell_width, dx, &beta, dy); + handle, transA, m, n, d_alpha_null, descr, dval, dcol, ell_width, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } // testing for(nullptr == d_beta) @@ -101,7 +101,7 @@ void testing_ellmv_bad_arg(void) T* d_beta_null = nullptr; status = rocsparse_ellmv( - handle, trans, m, n, &alpha, descr, dval, dcol, ell_width, dx, d_beta_null, dy); + handle, transA, m, n, &alpha, descr, dval, dcol, ell_width, dx, d_beta_null, dy); verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); } // testing for(nullptr == descr) @@ -109,7 +109,7 @@ void testing_ellmv_bad_arg(void) rocsparse_mat_descr 
descr_null = nullptr; status = rocsparse_ellmv( - handle, trans, m, n, &alpha, descr_null, dval, dcol, ell_width, dx, &beta, dy); + handle, transA, m, n, &alpha, descr_null, dval, dcol, ell_width, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } // testing for(nullptr == handle) @@ -117,7 +117,7 @@ void testing_ellmv_bad_arg(void) rocsparse_handle handle_null = nullptr; status = rocsparse_ellmv( - handle_null, trans, m, n, &alpha, descr, dval, dcol, ell_width, dx, &beta, dy); + handle_null, transA, m, n, &alpha, descr, dval, dcol, ell_width, dx, &beta, dy); verify_rocsparse_status_invalid_handle(status); } } @@ -130,7 +130,7 @@ rocsparse_status testing_ellmv(Arguments argus) rocsparse_int n = argus.N; T h_alpha = argus.alpha; T h_beta = argus.beta; - rocsparse_operation trans = argus.trans; + rocsparse_operation transA = argus.transA; rocsparse_index_base idx_base = argus.idx_base; rocsparse_status status; @@ -174,7 +174,7 @@ rocsparse_status testing_ellmv(Arguments argus) CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); status = - rocsparse_ellmv(handle, trans, m, n, &h_alpha, descr, dval, dcol, 0, dx, &h_beta, dy); + rocsparse_ellmv(handle, transA, m, n, &h_alpha, descr, dval, dcol, 0, dx, &h_beta, dy); if(m < 0 || n < 0 || nnz < 0) { @@ -320,12 +320,12 @@ rocsparse_status testing_ellmv(Arguments argus) // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); CHECK_ROCSPARSE_ERROR(rocsparse_ellmv( - handle, trans, m, n, &h_alpha, descr, dval, dcol, ell_width, dx, &h_beta, dy_1)); + handle, transA, m, n, &h_alpha, descr, dval, dcol, ell_width, dx, &h_beta, dy_1)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); CHECK_ROCSPARSE_ERROR(rocsparse_ellmv( - handle, trans, m, n, d_alpha, descr, dval, dcol, ell_width, dx, d_beta, dy_2)); + handle, transA, 
m, n, d_alpha, descr, dval, dcol, ell_width, dx, d_beta, dy_2)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); @@ -361,7 +361,7 @@ rocsparse_status testing_ellmv(Arguments argus) for(int iter = 0; iter < number_cold_calls; iter++) { rocsparse_ellmv( - handle, trans, m, n, &h_alpha, descr, dval, dcol, ell_width, dx, &h_beta, dy_1); + handle, transA, m, n, &h_alpha, descr, dval, dcol, ell_width, dx, &h_beta, dy_1); } double gpu_time_used = get_time_us(); // in microseconds @@ -369,7 +369,7 @@ rocsparse_status testing_ellmv(Arguments argus) for(int iter = 0; iter < number_hot_calls; iter++) { rocsparse_ellmv( - handle, trans, m, n, &h_alpha, descr, dval, dcol, ell_width, dx, &h_beta, dy_1); + handle, transA, m, n, &h_alpha, descr, dval, dcol, ell_width, dx, &h_beta, dy_1); } // Convert to miliseconds per call diff --git a/clients/include/testing_hybmv.hpp b/clients/include/testing_hybmv.hpp index 1c5f67ad..40234b80 100644 --- a/clients/include/testing_hybmv.hpp +++ b/clients/include/testing_hybmv.hpp @@ -38,7 +38,7 @@ void testing_hybmv_bad_arg(void) rocsparse_int safe_size = 100; T alpha = 0.6; T beta = 0.2; - rocsparse_operation trans = rocsparse_operation_none; + rocsparse_operation transA = rocsparse_operation_none; rocsparse_status status; std::unique_ptr unique_ptr_handle(new handle_struct); @@ -66,49 +66,49 @@ void testing_hybmv_bad_arg(void) { T* dx_null = nullptr; - status = rocsparse_hybmv(handle, trans, &alpha, descr, hyb, dx_null, &beta, dy); + status = rocsparse_hybmv(handle, transA, &alpha, descr, hyb, dx_null, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); } // testing for(nullptr == dy) { T* dy_null = nullptr; - status = rocsparse_hybmv(handle, trans, &alpha, descr, hyb, dx, &beta, dy_null); + status = rocsparse_hybmv(handle, transA, &alpha, descr, hyb, dx, &beta, dy_null); verify_rocsparse_status_invalid_pointer(status, "Error: dy is 
nullptr"); } // testing for(nullptr == d_alpha) { T* d_alpha_null = nullptr; - status = rocsparse_hybmv(handle, trans, d_alpha_null, descr, hyb, dx, &beta, dy); + status = rocsparse_hybmv(handle, transA, d_alpha_null, descr, hyb, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } // testing for(nullptr == d_beta) { T* d_beta_null = nullptr; - status = rocsparse_hybmv(handle, trans, &alpha, descr, hyb, dx, d_beta_null, dy); + status = rocsparse_hybmv(handle, transA, &alpha, descr, hyb, dx, d_beta_null, dy); verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); } // testing for(nullptr == hyb) { rocsparse_hyb_mat hyb_null = nullptr; - status = rocsparse_hybmv(handle, trans, &alpha, descr, hyb_null, dx, &beta, dy); + status = rocsparse_hybmv(handle, transA, &alpha, descr, hyb_null, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } // testing for(nullptr == descr) { rocsparse_mat_descr descr_null = nullptr; - status = rocsparse_hybmv(handle, trans, &alpha, descr_null, hyb, dx, &beta, dy); + status = rocsparse_hybmv(handle, transA, &alpha, descr_null, hyb, dx, &beta, dy); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } // testing for(nullptr == handle) { rocsparse_handle handle_null = nullptr; - status = rocsparse_hybmv(handle_null, trans, &alpha, descr, hyb, dx, &beta, dy); + status = rocsparse_hybmv(handle_null, transA, &alpha, descr, hyb, dx, &beta, dy); verify_rocsparse_status_invalid_handle(status); } } @@ -121,7 +121,7 @@ rocsparse_status testing_hybmv(Arguments argus) rocsparse_int n = argus.N; T h_alpha = argus.alpha; T h_beta = argus.beta; - rocsparse_operation trans = argus.trans; + rocsparse_operation transA = argus.transA; rocsparse_index_base idx_base = argus.idx_base; rocsparse_hyb_partition part = argus.part; rocsparse_int user_ell_width = argus.ell_width; @@ -182,7 +182,7 @@ rocsparse_status testing_hybmv(Arguments 
argus) // hybmv should be able to deal with m <= 0 || n <= 0 || nnz <= 0 even if csr2hyb fails // because hyb structures is allocated with n = m = 0 - so nothing should happen - status = rocsparse_hybmv(handle, trans, &h_alpha, descr, hyb, dx, &h_beta, dy); + status = rocsparse_hybmv(handle, transA, &h_alpha, descr, hyb, dx, &h_beta, dy); verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); return rocsparse_status_success; @@ -300,12 +300,12 @@ rocsparse_status testing_hybmv(Arguments argus) // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); CHECK_ROCSPARSE_ERROR( - rocsparse_hybmv(handle, trans, &h_alpha, descr, hyb, dx, &h_beta, dy_1)); + rocsparse_hybmv(handle, transA, &h_alpha, descr, hyb, dx, &h_beta, dy_1)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); CHECK_ROCSPARSE_ERROR( - rocsparse_hybmv(handle, trans, d_alpha, descr, hyb, dx, d_beta, dy_2)); + rocsparse_hybmv(handle, transA, d_alpha, descr, hyb, dx, d_beta, dy_2)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); @@ -338,14 +338,14 @@ rocsparse_status testing_hybmv(Arguments argus) for(int iter = 0; iter < number_cold_calls; iter++) { - rocsparse_hybmv(handle, trans, &h_alpha, descr, hyb, dx, &h_beta, dy_1); + rocsparse_hybmv(handle, transA, &h_alpha, descr, hyb, dx, &h_beta, dy_1); } double gpu_time_used = get_time_us(); // in microseconds for(int iter = 0; iter < number_hot_calls; iter++) { - rocsparse_hybmv(handle, trans, &h_alpha, descr, hyb, dx, &h_beta, dy_1); + rocsparse_hybmv(handle, transA, &h_alpha, descr, hyb, dx, &h_beta, dy_1); } testhyb* dhyb = (testhyb*)hyb; From 113385b77be910879fd158a5e9816327cc3d4cc5 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 30 Jul 2018 15:02:25 +0200 Subject: [PATCH 189/304] tests: Argument class adjusted --- 
clients/include/utility.hpp | 44 +++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index 31bb4254..c8035f1d 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -595,12 +595,17 @@ class Arguments public: rocsparse_int M = 128; rocsparse_int N = 128; + rocsparse_int K = 128; rocsparse_int nnz = 32; + rocsparse_int ldb; + rocsparse_int ldc; + double alpha = 1.0; double beta = 0.0; - rocsparse_operation trans = rocsparse_operation_none; + rocsparse_operation transA = rocsparse_operation_none; + rocsparse_operation transB = rocsparse_operation_none; rocsparse_index_base idx_base = rocsparse_index_base_zero; rocsparse_index_base idx_base2 = rocsparse_index_base_zero; rocsparse_hyb_partition part = rocsparse_hyb_partition_auto; @@ -617,27 +622,32 @@ class Arguments Arguments& operator=(const Arguments& rhs) { - M = rhs.M; - N = rhs.N; - nnz = rhs.nnz; + this->M = rhs.M; + this->N = rhs.N; + this->K = rhs.K; + this->nnz = rhs.nnz; + + this->ldb = rhs.ldb; + this->ldc = rhs.ldc; - alpha = rhs.alpha; - beta = rhs.beta; + this->alpha = rhs.alpha; + this->beta = rhs.beta; - trans = rhs.trans; - idx_base = rhs.idx_base; - idx_base2 = rhs.idx_base2; - part = rhs.part; + this->transA = rhs.transA; + this->transB = rhs.transB; + this->idx_base = rhs.idx_base; + this->idx_base2 = rhs.idx_base2; + this->part = rhs.part; - norm_check = rhs.norm_check; - unit_check = rhs.unit_check; - timing = rhs.timing; + this->norm_check = rhs.norm_check; + this->unit_check = rhs.unit_check; + this->timing = rhs.timing; - iters = rhs.iters; - laplacian = rhs.laplacian; - ell_width = rhs.ell_width; + this->iters = rhs.iters; + this->laplacian = rhs.laplacian; + this->ell_width = rhs.ell_width; - filename = rhs.filename; + this->filename = rhs.filename; return *this; } From 6cd06a3f8e2ee140d3644905339eca8d9af8e0d7 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 
31 Jul 2018 10:42:49 +0200 Subject: [PATCH 190/304] fixed an issue with unit check, where values were left out when checking 2D arrays --- clients/common/unit.cpp | 66 ++++++++++++++++++++++------ clients/include/testing_axpyi.hpp | 4 +- clients/include/testing_coo2csr.hpp | 2 +- clients/include/testing_coosort.hpp | 8 ++-- clients/include/testing_csr2coo.hpp | 2 +- clients/include/testing_csr2csc.hpp | 8 ++-- clients/include/testing_csr2ell.hpp | 8 ++-- clients/include/testing_csrmv.hpp | 4 +- clients/include/testing_csrsort.hpp | 6 +-- clients/include/testing_doti.hpp | 4 +- clients/include/testing_ellmv.hpp | 4 +- clients/include/testing_gthr.hpp | 2 +- clients/include/testing_gthrz.hpp | 4 +- clients/include/testing_identity.hpp | 2 +- clients/include/testing_roti.hpp | 8 ++-- clients/include/testing_sctr.hpp | 2 +- clients/include/unit.hpp | 7 ++- 17 files changed, 92 insertions(+), 49 deletions(-) diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index d7533216..87965860 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -6,6 +6,7 @@ #include #include +#include #ifdef GOOGLE_TEST #include @@ -23,69 +24,108 @@ * ==================================================== */ /*! 
\brief Template: gtest unit compare two matrices float/double/complex */ -// Do not put a wrapper over ASSERT_FLOAT_EQ, sincer assert exit the current function NOT the test +// Do not put a wrapper over ASSERT_FLOAT_EQ, since assert exit the current function NOT the test // case // a wrapper will cause the loop keep going template <> -void unit_check_general(rocsparse_int M, rocsparse_int N, float* hCPU, float* hGPU) +void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int lda, float* hCPU, float* hGPU) { for(rocsparse_int j = 0; j < N; j++) { for(rocsparse_int i = 0; i < M; i++) { #ifdef GOOGLE_TEST - ASSERT_FLOAT_EQ(hCPU[i + j], hGPU[i + j]); + ASSERT_FLOAT_EQ(hCPU[i + j * lda], hGPU[i + j * lda]); #else - assert(hCPU[i + j] == hGPU[i + j]); + assert(hCPU[i + j * lda] == hGPU[i + j * lda]); #endif } } } template <> -void unit_check_general(rocsparse_int M, rocsparse_int N, double* hCPU, double* hGPU) +void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int lda, double* hCPU, double* hGPU) { for(rocsparse_int j = 0; j < N; j++) { for(rocsparse_int i = 0; i < M; i++) { #ifdef GOOGLE_TEST - ASSERT_DOUBLE_EQ(hCPU[i + j], hGPU[i + j]); + ASSERT_DOUBLE_EQ(hCPU[i + j * lda], hGPU[i + j * lda]); #else - assert(hCPU[i + j] == hGPU[i + j]); + assert(hCPU[i + j * lda] == hGPU[i + j * lda]); #endif } } } template <> -void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int* hCPU, rocsparse_int* hGPU) +void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int lda, rocsparse_int* hCPU, rocsparse_int* hGPU) { for(rocsparse_int j = 0; j < N; j++) { for(rocsparse_int i = 0; i < M; i++) { #ifdef GOOGLE_TEST - ASSERT_EQ(hCPU[i + j], hGPU[i + j]); + ASSERT_EQ(hCPU[i + j * lda], hGPU[i + j * lda]); #else - assert(hCPU[i + j] == hGPU[i + j]); + assert(hCPU[i + j * lda] == hGPU[i + j * lda]); #endif } } } template <> -void unit_check_general(rocsparse_int M, rocsparse_int N, size_t* hCPU, size_t* hGPU) +void 
unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int lda, size_t* hCPU, size_t* hGPU) { for(rocsparse_int j = 0; j < N; j++) { for(rocsparse_int i = 0; i < M; i++) { #ifdef GOOGLE_TEST - ASSERT_EQ(hCPU[i + j], hGPU[i + j]); + ASSERT_EQ(hCPU[i + j * lda], hGPU[i + j * lda]); #else - assert(hCPU[i + j] == hGPU[i + j]); + assert(hCPU[i + j * lda] == hGPU[i + j * lda]); +#endif + } + } +} + +/*! \brief Template: gtest unit compare two matrices float/double/complex */ +// Do not put a wrapper over ASSERT_FLOAT_EQ, since assert exit the current function NOT the test +// case +// a wrapper will cause the loop keep going + +template <> +void unit_check_near(rocsparse_int M, rocsparse_int N, rocsparse_int lda, float* hCPU, float* hGPU) +{ + for(rocsparse_int j = 0; j < N; j++) + { + for(rocsparse_int i = 0; i < M; i++) + { + float compare_val = std::max(std::abs(hCPU[i + j * lda] * 1e-3f), 10 * std::numeric_limits::epsilon()); +#ifdef GOOGLE_TEST + ASSERT_NEAR(hCPU[i + j * lda], hGPU[i + j * lda], compare_val); +#else + assert(std::abs(hCPU[i + j * lda] - hGPU[i + j * lda]) < compare_val); +#endif + } + } +} + +template <> +void unit_check_near(rocsparse_int M, rocsparse_int N, rocsparse_int lda, double* hCPU, double* hGPU) +{ + for(rocsparse_int j = 0; j < N; j++) + { + for(rocsparse_int i = 0; i < M; i++) + { + double compare_val = std::max(std::abs(hCPU[i + j * lda] * 1e-12), 10 * std::numeric_limits::epsilon()); +#ifdef GOOGLE_TEST + ASSERT_NEAR(hCPU[i + j * lda], hGPU[i + j * lda], compare_val); +#else + assert(std::abs(hCPU[i + j * lda] - hGPU[i + j * lda]) < compare_val); #endif } } diff --git a/clients/include/testing_axpyi.hpp b/clients/include/testing_axpyi.hpp index 131fe462..283208bb 100644 --- a/clients/include/testing_axpyi.hpp +++ b/clients/include/testing_axpyi.hpp @@ -204,8 +204,8 @@ rocsparse_status testing_axpyi(Arguments argus) // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, N, 
hy_gold.data(), hy_1.data()); - unit_check_general(1, N, hy_gold.data(), hy_2.data()); + unit_check_general(1, N, 1, hy_gold.data(), hy_1.data()); + unit_check_general(1, N, 1, hy_gold.data(), hy_2.data()); } } diff --git a/clients/include/testing_coo2csr.hpp b/clients/include/testing_coo2csr.hpp index 3dd24c98..ffa14f39 100644 --- a/clients/include/testing_coo2csr.hpp +++ b/clients/include/testing_coo2csr.hpp @@ -246,7 +246,7 @@ rocsparse_status testing_coo2csr(Arguments argus) cpu_time_used = get_time_us() - cpu_time_used; // Unit check - unit_check_general(1, m + 1, hcsr_row_ptr_gold.data(), hcsr_row_ptr.data()); + unit_check_general(1, m + 1, 1, hcsr_row_ptr_gold.data(), hcsr_row_ptr.data()); } if(argus.timing) diff --git a/clients/include/testing_coosort.hpp b/clients/include/testing_coosort.hpp index eca9c7d9..9814b870 100644 --- a/clients/include/testing_coosort.hpp +++ b/clients/include/testing_coosort.hpp @@ -240,7 +240,7 @@ rocsparse_status testing_coosort(Arguments argus) // Buffer size should be zero size_t zero = 0; - unit_check_general(1, 1, &zero, &buffer_size); + unit_check_general(1, 1, 1, &zero, &buffer_size); } if(by_row) @@ -488,12 +488,12 @@ rocsparse_status testing_coosort(Arguments argus) } // Unit check - unit_check_general(1, nnz, hcoo_row_ind.data(), hcoo_row_ind_unsorted.data()); - unit_check_general(1, nnz, hcoo_col_ind.data(), hcoo_col_ind_unsorted.data()); + unit_check_general(1, nnz, 1, hcoo_row_ind.data(), hcoo_row_ind_unsorted.data()); + unit_check_general(1, nnz, 1, hcoo_col_ind.data(), hcoo_col_ind_unsorted.data()); if(permute) { - unit_check_general(1, nnz, hcoo_val.data(), hcoo_val_unsorted.data()); + unit_check_general(1, nnz, 1, hcoo_val.data(), hcoo_val_unsorted.data()); } } diff --git a/clients/include/testing_csr2coo.hpp b/clients/include/testing_csr2coo.hpp index 8758005b..a38f98d1 100644 --- a/clients/include/testing_csr2coo.hpp +++ b/clients/include/testing_csr2coo.hpp @@ -228,7 +228,7 @@ rocsparse_status 
testing_csr2coo(Arguments argus) } // Unit check - unit_check_general(1, nnz, hcoo_row_ind_gold.data(), hcoo_row_ind.data()); + unit_check_general(1, nnz, 1, hcoo_row_ind_gold.data(), hcoo_row_ind.data()); } if(argus.timing) diff --git a/clients/include/testing_csr2csc.hpp b/clients/include/testing_csr2csc.hpp index 250df296..2070671e 100644 --- a/clients/include/testing_csr2csc.hpp +++ b/clients/include/testing_csr2csc.hpp @@ -352,7 +352,7 @@ rocsparse_status testing_csr2csc(Arguments argus) // Buffer size should be zero size_t four = 4; - unit_check_general(1, 1, &four, &size); + unit_check_general(1, 1, 1, &four, &size); } status = rocsparse_csr2csc(handle, @@ -560,13 +560,13 @@ rocsparse_status testing_csr2csc(Arguments argus) hcsc_col_ptr_gold[0] = idx_base; // Unit check - unit_check_general(1, nnz, hcsc_row_ind_gold.data(), hcsc_row_ind.data()); - unit_check_general(1, n + 1, hcsc_col_ptr_gold.data(), hcsc_col_ptr.data()); + unit_check_general(1, nnz, 1, hcsc_row_ind_gold.data(), hcsc_row_ind.data()); + unit_check_general(1, n + 1, 1, hcsc_col_ptr_gold.data(), hcsc_col_ptr.data()); // If action == rocsparse_action_numeric also check values if(action == rocsparse_action_numeric) { - unit_check_general(1, nnz, hcsc_val_gold.data(), hcsc_val.data()); + unit_check_general(1, nnz, 1, hcsc_val_gold.data(), hcsc_val.data()); } } diff --git a/clients/include/testing_csr2ell.hpp b/clients/include/testing_csr2ell.hpp index bfef7cd3..73d5dfc2 100644 --- a/clients/include/testing_csr2ell.hpp +++ b/clients/include/testing_csr2ell.hpp @@ -518,8 +518,8 @@ rocsparse_status testing_csr2ell(Arguments argus) rocsparse_int ell_nnz = ell_width * m; // Check if ELL width does match - unit_check_general(1, 1, &ell_width_gold, &ell_width); - unit_check_general(1, 1, &ell_nnz_gold, &ell_nnz); + unit_check_general(1, 1, 1, &ell_width_gold, &ell_width); + unit_check_general(1, 1, 1, &ell_nnz_gold, &ell_nnz); // Allocate ELL device memory auto dell_col_ind_managed = @@ -550,8 +550,8 @@ 
rocsparse_status testing_csr2ell(Arguments argus) hipMemcpy(hell_val.data(), dell_val, sizeof(T) * ell_nnz, hipMemcpyDeviceToHost)); // Unit check - unit_check_general(1, ell_nnz, hell_col_ind_gold.data(), hell_col_ind.data()); - unit_check_general(1, ell_nnz, hell_val_gold.data(), hell_val.data()); + unit_check_general(1, ell_nnz, 1, hell_col_ind_gold.data(), hell_col_ind.data()); + unit_check_general(1, ell_nnz, 1, hell_val_gold.data(), hell_val.data()); } if(argus.timing) diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index d125de25..69e69670 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -417,8 +417,8 @@ rocsparse_status testing_csrmv(Arguments argus) cpu_time_used = get_time_us() - cpu_time_used; - unit_check_general(1, m, hy_gold.data(), hy_1.data()); - unit_check_general(1, m, hy_gold.data(), hy_2.data()); + unit_check_general(1, m, 1, hy_gold.data(), hy_1.data()); + unit_check_general(1, m, 1, hy_gold.data(), hy_2.data()); } if(argus.timing) diff --git a/clients/include/testing_csrsort.hpp b/clients/include/testing_csrsort.hpp index 32f37c5f..1e2eab70 100644 --- a/clients/include/testing_csrsort.hpp +++ b/clients/include/testing_csrsort.hpp @@ -219,7 +219,7 @@ rocsparse_status testing_csrsort(Arguments argus) // Buffer size should be zero size_t zero = 0; - unit_check_general(1, 1, &zero, &buffer_size); + unit_check_general(1, 1, 1, &zero, &buffer_size); } status = @@ -408,11 +408,11 @@ rocsparse_status testing_csrsort(Arguments argus) } // Unit check - unit_check_general(1, nnz, hcsr_col_ind.data(), hcsr_col_ind_unsorted.data()); + unit_check_general(1, nnz, 1, hcsr_col_ind.data(), hcsr_col_ind_unsorted.data()); if(permute) { - unit_check_general(1, nnz, hcsr_val.data(), hcsr_val_unsorted.data()); + unit_check_general(1, nnz, 1, hcsr_val.data(), hcsr_val_unsorted.data()); } } diff --git a/clients/include/testing_doti.hpp b/clients/include/testing_doti.hpp index 
16e44aa4..e0775d78 100644 --- a/clients/include/testing_doti.hpp +++ b/clients/include/testing_doti.hpp @@ -202,8 +202,8 @@ rocsparse_status testing_doti(Arguments argus) // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order - unit_check_general(1, 1, &hresult_gold, &hresult_1); - unit_check_general(1, 1, &hresult_gold, &hresult_2); + unit_check_general(1, 1, 1, &hresult_gold, &hresult_1); + unit_check_general(1, 1, 1, &hresult_gold, &hresult_2); } if(argus.timing) diff --git a/clients/include/testing_ellmv.hpp b/clients/include/testing_ellmv.hpp index 1be25e9b..8bc5c828 100644 --- a/clients/include/testing_ellmv.hpp +++ b/clients/include/testing_ellmv.hpp @@ -389,8 +389,8 @@ rocsparse_status testing_ellmv(Arguments argus) // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order - unit_check_general(1, m, hy_gold.data(), hy_1.data()); - unit_check_general(1, m, hy_gold.data(), hy_2.data()); + unit_check_general(1, m, 1, hy_gold.data(), hy_1.data()); + unit_check_general(1, m, 1, hy_gold.data(), hy_2.data()); } if(argus.timing) diff --git a/clients/include/testing_gthr.hpp b/clients/include/testing_gthr.hpp index 9cefd4a8..74f6baa6 100644 --- a/clients/include/testing_gthr.hpp +++ b/clients/include/testing_gthr.hpp @@ -173,7 +173,7 @@ rocsparse_status testing_gthr(Arguments argus) // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order - unit_check_general(1, nnz, hx_val_gold.data(), hx_val.data()); + unit_check_general(1, nnz, 1, hx_val_gold.data(), hx_val.data()); } if(argus.timing) diff --git a/clients/include/testing_gthrz.hpp b/clients/include/testing_gthrz.hpp index 072e1fa3..f59595cf 100644 --- a/clients/include/testing_gthrz.hpp +++ b/clients/include/testing_gthrz.hpp @@ -178,8 +178,8 @@ rocsparse_status 
testing_gthrz(Arguments argus) // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order - unit_check_general(1, nnz, hx_val_gold.data(), hx_val.data()); - unit_check_general(1, N, hy_gold.data(), hy.data()); + unit_check_general(1, nnz, 1, hx_val_gold.data(), hx_val.data()); + unit_check_general(1, N, 1, hy_gold.data(), hy.data()); } if(argus.timing) diff --git a/clients/include/testing_identity.hpp b/clients/include/testing_identity.hpp index c0d969bc..f29e7a08 100644 --- a/clients/include/testing_identity.hpp +++ b/clients/include/testing_identity.hpp @@ -120,7 +120,7 @@ rocsparse_status testing_identity(Arguments argus) CHECK_HIP_ERROR(hipMemcpy(hp.data(), dp, sizeof(rocsparse_int) * n, hipMemcpyDeviceToHost)); // Unit check - unit_check_general(1, n, hp_gold.data(), hp.data()); + unit_check_general(1, n, 1, hp_gold.data(), hp.data()); } if(argus.timing) diff --git a/clients/include/testing_roti.hpp b/clients/include/testing_roti.hpp index b2e6ebe2..8336209a 100644 --- a/clients/include/testing_roti.hpp +++ b/clients/include/testing_roti.hpp @@ -234,10 +234,10 @@ rocsparse_status testing_roti(Arguments argus) // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order - unit_check_general(1, nnz, hx_val_gold.data(), hx_val_1.data()); - unit_check_general(1, nnz, hx_val_gold.data(), hx_val_2.data()); - unit_check_general(1, N, hy_gold.data(), hy_1.data()); - unit_check_general(1, N, hy_gold.data(), hy_2.data()); + unit_check_general(1, nnz, 1, hx_val_gold.data(), hx_val_1.data()); + unit_check_general(1, nnz, 1, hx_val_gold.data(), hx_val_2.data()); + unit_check_general(1, N, 1, hy_gold.data(), hy_1.data()); + unit_check_general(1, N, 1, hy_gold.data(), hy_2.data()); } if(argus.timing) diff --git a/clients/include/testing_sctr.hpp b/clients/include/testing_sctr.hpp index b7a781e9..3afe9926 100644 --- 
a/clients/include/testing_sctr.hpp +++ b/clients/include/testing_sctr.hpp @@ -177,7 +177,7 @@ rocsparse_status testing_sctr(Arguments argus) // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order - unit_check_general(1, N, hy_gold.data(), hy.data()); + unit_check_general(1, N, 1, hy_gold.data(), hy.data()); } if(argus.timing) diff --git a/clients/include/unit.hpp b/clients/include/unit.hpp index da76ba6f..07d78abe 100644 --- a/clients/include/unit.hpp +++ b/clients/include/unit.hpp @@ -23,10 +23,13 @@ * ==================================================== */ /*! \brief Template: gtest unit compare two matrices float/double/complex */ -// Do not put a wrapper over ASSERT_FLOAT_EQ, sincer assert exit the current function NOT the test +// Do not put a wrapper over ASSERT_FLOAT_EQ, since assert exit the current function NOT the test // case // a wrapper will cause the loop keep going template -void unit_check_general(rocsparse_int M, rocsparse_int N, T* hCPU, T* hGPU); +void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int lda, T* hCPU, T* hGPU); + +template +void unit_check_near(rocsparse_int M, rocsparse_int N, rocsparse_int lda, T* hCPU, T* hGPU); #endif // UNIT_HPP From 52be15a5237ab7ff890173800b8d6ae3fb1a9770 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 31 Jul 2018 10:43:15 +0200 Subject: [PATCH 191/304] updated test matrix conversion tool --- deps/convert | Bin 23000 -> 23000 bytes deps/convert.cpp | 4 ++++ 2 files changed, 4 insertions(+) diff --git a/deps/convert b/deps/convert index 0d80c004d97acfd8e425f51a0ec8a3f172775563..1395a45d6e25951dcd55c8e1488d0749d5828fc4 100755 GIT binary patch delta 636 zcmYLHT}YE*6h3E8bcTLgep}xPYv9OqbDOcSsPJ<*rXc#+k5^I=Gz)^15ThHr$gsR< zc8Hb)*^OWX{(LtDkxCaTL9IJWDm4PVa%dW1cskRd=i)r?^PKmb_q;FftckNGCQ`P} z5NxBf@6))$@a+i0r*AI|8p1hs^ZL1_{wIHI@9&&0&g;F}x;S#8-%MIo9N0lUYB4s^ 
z3$+e&XhZ$zI+6u2-dLu?s~qW2e|KYTQ0ey27iSr+Q?4tD!_?p^#Y^T5*CbjtfsSkY z)$Fm|0KGL{y|%aBYpg1c8gDJNYYjL`6IxMGF*7R%U~RBkc{HUwn5ZtDiYjk7+Re)C zXSAi2VgVg;A6B2InO7CN6vve_*{>Y{%j*<(AKev~21{@QF3K#23@52!d;=S%+qp@wr|X}R)E9;b5e9-kvX05zcU*MVE-Y$ zFf7SI!t=r)FJY`P=SrRali6bd7+7b%B@DlqFH2tE5-s@=;c4Lv)Exr6d5YfooR;)F zE&2j@fgJu4Owuv`dDQ5!KZb3z;BQ1bRg_nXbEQ1v&y;r+&dM+s^8l>jkPeXo`+ZA< zR{VRZBcNg>-3uJV1dRrsqE1c0syxpevnC^1Td6-7z?U==jN!cL2z|&m(PDZLM7$S_u&@k(w!**%(`)ttm|Ibu3SK3nleo%_qMP(+*AZ(5{mt_3~ DNCW3a delta 614 zcmYLHT}YE*6n@XPQfKyU`E7kG%t;?P-M8(liwr{cIUJiHI;4ys=m(kISaw%Ixto#U zMaj9bAQ}ddQS|d&L{K@gi``Whfj^YlAG4d*%taS@I@6$Yah~^i&Uv5ryl`erm@(n% z>#BZTtVRFm?4;B1uM@*>tQ`!TfO&ax@7u38dOpv+xOM43^~&>a7c1IKbIkJH2^Vl! z-U)~Cp4 zxL?_!)IHuJL{V}cQR7L;H$Bp>8u5`%HGu<47u?5ZN}au)h@|=yd*XuPhHF?-nqVV( zJ$p8d7l?|Iqg{xot7T+cG*7|S8AgMjlrIPZ5^#3B~`K^Vjatr6~EhjtWvIHAQL zg+H`z@M2q_g_U!GvOXIas?GB-C+tGlRNWAf&5kRU2$Hr9(?J=+cs;laGB_2y2?;zN z3fuL0B85k?9m2bzAmng16oZ1LlEpL3%F~oXH6QghTK@umVcM4f diff --git a/deps/convert.cpp b/deps/convert.cpp index a42bba25..fb2b0397 100644 --- a/deps/convert.cpp +++ b/deps/convert.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -127,6 +128,9 @@ int read_mtx_matrix(const char* filename, --irow; --icol; + // Take absolute matrix value to avoid rounding issues when testing + ival = std::abs(ival); + unsorted_row[idx] = irow; unsorted_col[idx] = icol; unsorted_val[idx] = ival; From b359eb324230562650950d38b5dfd8f1c176d737 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 31 Jul 2018 10:43:43 +0200 Subject: [PATCH 192/304] added sanity check for ELL width when partition type == max --- library/src/conversion/rocsparse_csr2hyb.hpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/library/src/conversion/rocsparse_csr2hyb.hpp b/library/src/conversion/rocsparse_csr2hyb.hpp index 2c33d088..64c63151 100644 --- a/library/src/conversion/rocsparse_csr2hyb.hpp +++ b/library/src/conversion/rocsparse_csr2hyb.hpp @@ -111,6 +111,9 @@ 
rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, // Correct by index base csr_nnz -= descr->base; + // Maximum ELL row width allowed + rocsparse_int max_row_nnz = (2 * csr_nnz - 1) / m + 1; + // Check user_ell_width if(partition_type == rocsparse_hyb_partition_user) { @@ -120,7 +123,6 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, return rocsparse_status_invalid_value; } - rocsparse_int max_row_nnz = (2 * csr_nnz - 1) / m + 1; if(user_ell_width > max_row_nnz) { return rocsparse_status_invalid_value; @@ -202,6 +204,13 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, hipMemcpy(&hyb->ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); } + // Re-check ELL width + if(hyb->ell_width > max_row_nnz) + { + RETURN_IF_HIP_ERROR(hipFree(workspace)); + return rocsparse_status_invalid_value; + } + // Compute ELL non-zeros hyb->ell_nnz = hyb->ell_width * m; From 997721bbafa31710e7b71012083839cfa006a66f Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 31 Jul 2018 10:44:03 +0200 Subject: [PATCH 193/304] real matrix testing for coomv --- clients/include/testing_coomv.hpp | 48 +++++++++++++++---- clients/tests/test_coomv.cpp | 80 +++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 9 deletions(-) diff --git a/clients/include/testing_coomv.hpp b/clients/include/testing_coomv.hpp index c2bd8b17..424baf91 100644 --- a/clients/include/testing_coomv.hpp +++ b/clients/include/testing_coomv.hpp @@ -139,8 +139,23 @@ rocsparse_status testing_coomv(Arguments argus) T h_beta = argus.beta; rocsparse_operation trans = argus.trans; rocsparse_index_base idx_base = argus.idx_base; + std::string binfile = ""; + std::string filename = ""; rocsparse_status status; + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && n == -99 && argus.timing == 0) + { + binfile = argus.filename; + m = n = safe_size; + } + + if(argus.timing 
== 1) + { + filename = argus.filename; + } + std::unique_ptr test_handle(new handle_struct); rocsparse_handle handle = test_handle->handle; @@ -206,7 +221,25 @@ rocsparse_status testing_coomv(Arguments argus) // Initial Data on CPU srand(12345ULL); - if(argus.laplacian) + if(binfile != "") + { + if(read_bin_matrix(binfile.c_str(), m, n, nnz, hptr, hcol, hval, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + + // Convert CSR to COO + hrow.resize(nnz); + for(rocsparse_int i = 0; i < m; ++i) + { + for(rocsparse_int j = hptr[i]; j < hptr[i + 1]; ++j) + { + hrow[j - idx_base] = i + idx_base; + } + } + } + else if(argus.laplacian) { m = n = gen_2d_laplacian(argus.laplacian, hptr, hcol, hval, idx_base); nnz = hptr[m]; @@ -223,11 +256,11 @@ rocsparse_status testing_coomv(Arguments argus) } else { - if(argus.filename != "") + if(filename != "") { - if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hrow, hcol, hval, idx_base) != 0) + if(read_mtx_matrix(filename.c_str(), m, n, nnz, hrow, hcol, hval, idx_base) != 0) { - fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; } } @@ -324,11 +357,8 @@ rocsparse_status testing_coomv(Arguments argus) // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order - if(argus.unit_check) - { - unit_check_general(1, m, hy_gold.data(), hy_1.data()); - unit_check_general(1, m, hy_gold.data(), hy_2.data()); - } + unit_check_near(1, m, 1, hy_gold.data(), hy_1.data()); + unit_check_near(1, m, 1, hy_gold.data(), hy_2.data()); } if(argus.timing) diff --git a/clients/tests/test_coomv.cpp b/clients/tests/test_coomv.cpp index c604ad2c..6f2634ae 100644 --- a/clients/tests/test_coomv.cpp +++ b/clients/tests/test_coomv.cpp @@ -8,9 +8,11 @@ #include #include #include 
+#include typedef rocsparse_index_base base; typedef std::tuple coomv_tuple; +typedef std::tuple coomv_bin_tuple; int coo_M_range[] = {-1, 0, 10, 500, 7111, 10000}; int coo_N_range[] = {-3, 0, 33, 842, 4441, 10000}; @@ -20,6 +22,21 @@ std::vector coo_beta_range = {0.0, 0.67, 1.0}; base coo_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +std::string coo_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "ASIC_320k.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + class parameterized_coomv : public testing::TestWithParam { protected: @@ -29,6 +46,15 @@ class parameterized_coomv : public testing::TestWithParam virtual void TearDown() {} }; +class parameterized_coomv_bin : public testing::TestWithParam +{ + protected: + parameterized_coomv_bin() {} + virtual ~parameterized_coomv_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + Arguments setup_coomv_arguments(coomv_tuple tup) { Arguments arg; @@ -41,6 +67,37 @@ Arguments setup_coomv_arguments(coomv_tuple tup) return arg; } +Arguments setup_coomv_arguments(coomv_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = -99; + arg.alpha = std::get<0>(tup); + arg.beta = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<3>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + + return arg; +} + TEST(coomv_bad_arg, coomv_float) { testing_coomv_bad_arg(); } TEST_P(parameterized_coomv, coomv_float) @@ -59,6 +116,22 @@ 
TEST_P(parameterized_coomv, coomv_double) EXPECT_EQ(status, rocsparse_status_success); } +TEST_P(parameterized_coomv_bin, coomv_bin_float) +{ + Arguments arg = setup_coomv_arguments(GetParam()); + + rocsparse_status status = testing_coomv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_coomv_bin, coomv_bin_double) +{ + Arguments arg = setup_coomv_arguments(GetParam()); + + rocsparse_status status = testing_coomv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + INSTANTIATE_TEST_CASE_P(coomv, parameterized_coomv, testing::Combine(testing::ValuesIn(coo_M_range), @@ -66,3 +139,10 @@ INSTANTIATE_TEST_CASE_P(coomv, testing::ValuesIn(coo_alpha_range), testing::ValuesIn(coo_beta_range), testing::ValuesIn(coo_idxbase_range))); + +INSTANTIATE_TEST_CASE_P(coomv_bin, + parameterized_coomv_bin, + testing::Combine(testing::ValuesIn(coo_alpha_range), + testing::ValuesIn(coo_beta_range), + testing::ValuesIn(coo_idxbase_range), + testing::ValuesIn(coo_bin))); From 28a3643aa5f9c29e82af06df5ec3da1d72f9aa4d Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 31 Jul 2018 10:44:23 +0200 Subject: [PATCH 194/304] real matrix testing for hybmv and csr2hyb --- clients/include/testing_csr2hyb.hpp | 49 ++++++++-- clients/include/testing_hybmv.hpp | 140 +++++++++++++++++++++++++--- clients/tests/test_csr2hyb.cpp | 1 + clients/tests/test_hybmv.cpp | 85 +++++++++++++++++ 4 files changed, 251 insertions(+), 24 deletions(-) diff --git a/clients/include/testing_csr2hyb.hpp b/clients/include/testing_csr2hyb.hpp index 57ce2043..c6db680f 100644 --- a/clients/include/testing_csr2hyb.hpp +++ b/clients/include/testing_csr2hyb.hpp @@ -328,6 +328,35 @@ rocsparse_status testing_csr2hyb(Arguments argus) } } + // Max width check + if(part == rocsparse_hyb_partition_max) + { + // Compute max ELL width + rocsparse_int ell_max_width = 0; + for(rocsparse_int i = 0; i < m; ++i) + { + ell_max_width = std::max(hcsr_row_ptr[i + 1] - hcsr_row_ptr[i], ell_max_width); + } + + 
rocsparse_int width_limit = (2 * nnz - 1) / m + 1; + if(ell_max_width > width_limit) + { + status = rocsparse_csr2hyb(handle, + m, + n, + descr, + dcsr_val, + dcsr_row_ptr, + dcsr_col_ind, + hyb, + user_ell_width, + part); + + verify_rocsparse_status_invalid_value(status, "ell_max_width > width_limit"); + return rocsparse_status_success; + } + } + // Host structures for verification std::vector hhyb_ell_col_ind_gold; std::vector hhyb_ell_val_gold; @@ -430,11 +459,11 @@ rocsparse_status testing_csr2hyb(Arguments argus) test_hyb* dhyb = (test_hyb*)hyb; // Check if sizes match - unit_check_general(1, 1, &m, &dhyb->m); - unit_check_general(1, 1, &n, &dhyb->n); - unit_check_general(1, 1, &ell_width, &dhyb->ell_width); - unit_check_general(1, 1, &ell_nnz, &dhyb->ell_nnz); - unit_check_general(1, 1, &coo_nnz, &dhyb->coo_nnz); + unit_check_general(1, 1, 1, &m, &dhyb->m); + unit_check_general(1, 1, 1, &n, &dhyb->n); + unit_check_general(1, 1, 1, &ell_width, &dhyb->ell_width); + unit_check_general(1, 1, 1, &ell_nnz, &dhyb->ell_nnz); + unit_check_general(1, 1, 1, &coo_nnz, &dhyb->coo_nnz); CHECK_HIP_ERROR(hipMemcpy(hhyb_ell_col_ind.data(), dhyb->ell_col_ind, @@ -454,11 +483,11 @@ rocsparse_status testing_csr2hyb(Arguments argus) hhyb_coo_val.data(), dhyb->coo_val, sizeof(T) * coo_nnz, hipMemcpyDeviceToHost)); // Unit check - unit_check_general(1, ell_nnz, hhyb_ell_col_ind_gold.data(), hhyb_ell_col_ind.data()); - unit_check_general(1, ell_nnz, hhyb_ell_val_gold.data(), hhyb_ell_val.data()); - unit_check_general(1, coo_nnz, hhyb_coo_row_ind_gold.data(), hhyb_coo_row_ind.data()); - unit_check_general(1, coo_nnz, hhyb_coo_col_ind_gold.data(), hhyb_coo_col_ind.data()); - unit_check_general(1, coo_nnz, hhyb_coo_val_gold.data(), hhyb_coo_val.data()); + unit_check_general(1, ell_nnz, 1, hhyb_ell_col_ind_gold.data(), hhyb_ell_col_ind.data()); + unit_check_general(1, ell_nnz, 1, hhyb_ell_val_gold.data(), hhyb_ell_val.data()); + unit_check_general(1, coo_nnz, 1, 
hhyb_coo_row_ind_gold.data(), hhyb_coo_row_ind.data()); + unit_check_general(1, coo_nnz, 1, hhyb_coo_col_ind_gold.data(), hhyb_coo_col_ind.data()); + unit_check_general(1, coo_nnz, 1, hhyb_coo_val_gold.data(), hhyb_coo_val.data()); } if(argus.timing) diff --git a/clients/include/testing_hybmv.hpp b/clients/include/testing_hybmv.hpp index 879cf969..158d0fad 100644 --- a/clients/include/testing_hybmv.hpp +++ b/clients/include/testing_hybmv.hpp @@ -17,6 +17,10 @@ using namespace rocsparse; using namespace rocsparse_test; +#define ELL_IND_ROW(i, el, m, width) (el) * (m) + (i) +#define ELL_IND_EL(i, el, m, width) (el) + (width) * (i) +#define ELL_IND(i, el, m, width) ELL_IND_ROW(i, el, m, width) + struct testhyb { rocsparse_int m; @@ -125,8 +129,23 @@ rocsparse_status testing_hybmv(Arguments argus) rocsparse_index_base idx_base = argus.idx_base; rocsparse_hyb_partition part = argus.part; rocsparse_int user_ell_width = argus.ell_width; + std::string binfile = ""; + std::string filename = ""; rocsparse_status status; + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && n == -99 && argus.timing == 0) + { + binfile = argus.filename; + m = n = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + std::unique_ptr test_handle(new handle_struct); rocsparse_handle handle = test_handle->handle; @@ -196,19 +215,27 @@ rocsparse_status testing_hybmv(Arguments argus) // Initial Data on CPU srand(12345ULL); - if(argus.laplacian) + if(binfile != "") + { + if(read_bin_matrix(binfile.c_str(), m, n, nnz, hcsr_row_ptr, hcol_ind, hval, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + } + else if(argus.laplacian) { m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcol_ind, hval, idx_base); nnz = hcsr_row_ptr[m]; } else { - if(argus.filename != "") + if(filename != "") { if(read_mtx_matrix( - 
argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base) != 0) + filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base) != 0) { - fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; } } @@ -283,18 +310,62 @@ rocsparse_status testing_hybmv(Arguments argus) CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); - // User given ELL width + // ELL width limit + rocsparse_int width_limit = (2 * nnz - 1) / m + 1; + + // Limit ELL user width if(part == rocsparse_hyb_partition_user) { user_ell_width = user_ell_width * nnz / m; + user_ell_width = std::min(width_limit, user_ell_width); } // Convert CSR to HYB - CHECK_ROCSPARSE_ERROR( - rocsparse_csr2hyb(handle, m, n, descr, dval, dptr, dcol, hyb, user_ell_width, part)); + status = rocsparse_csr2hyb(handle, m, n, descr, dval, dptr, dcol, hyb, user_ell_width, part); + + if(part == rocsparse_hyb_partition_max) + { + // Compute max ELL width + rocsparse_int ell_max_width = 0; + for(rocsparse_int i = 0; i < m; ++i) + { + ell_max_width = std::max(hcsr_row_ptr[i + 1] - hcsr_row_ptr[i], ell_max_width); + } + + if(ell_max_width > width_limit) + { + verify_rocsparse_status_invalid_value(status, "ell_max_width > width_limit"); + return rocsparse_status_success; + } + } if(argus.unit_check) { + // Copy HYB structure to CPU + testhyb* dhyb = (testhyb*)hyb; + + rocsparse_int ell_nnz = dhyb->ell_nnz; + rocsparse_int coo_nnz = dhyb->coo_nnz; + + std::vector hell_col(ell_nnz); + std::vector hell_val(ell_nnz); + std::vector hcoo_row(coo_nnz); + std::vector hcoo_col(coo_nnz); + std::vector hcoo_val(coo_nnz); + + if(ell_nnz > 0) + { + CHECK_HIP_ERROR(hipMemcpy(hell_col.data(), dhyb->ell_col_ind, sizeof(rocsparse_int) * ell_nnz, hipMemcpyDeviceToHost)); + 
CHECK_HIP_ERROR(hipMemcpy(hell_val.data(), dhyb->ell_val, sizeof(T) * ell_nnz, hipMemcpyDeviceToHost)); + } + + if(coo_nnz > 0) + { + CHECK_HIP_ERROR(hipMemcpy(hcoo_row.data(), dhyb->coo_row_ind, sizeof(rocsparse_int) * coo_nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hcoo_col.data(), dhyb->coo_col_ind, sizeof(rocsparse_int) * coo_nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hcoo_val.data(), dhyb->coo_val, sizeof(T) * coo_nnz, hipMemcpyDeviceToHost)); + } + CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * m, hipMemcpyHostToDevice)); // ROCSPARSE pointer mode host @@ -314,20 +385,61 @@ rocsparse_status testing_hybmv(Arguments argus) // CPU double cpu_time_used = get_time_us(); - for(rocsparse_int i = 0; i < m; ++i) + // ELL part + if(ell_nnz > 0) + { + for(rocsparse_int i = 0; i < m; ++i) + { + T sum = static_cast(0); + for(rocsparse_int p = 0; p < dhyb->ell_width; ++p) + { + rocsparse_int idx = ELL_IND(i, p, m, dhyb->ell_width); + rocsparse_int col = hell_col[idx] - idx_base; + + if(col >= 0 && col < n) + { + sum += hell_val[idx] * hx[col]; + } + else + { + break; + } + } + + if(h_beta != static_cast(0)) + { + hy_gold[i] = h_beta * hy_gold[i] + h_alpha * sum; + } + else + { + hy_gold[i] = h_alpha * sum; + } + } + } + + // COO part + if(coo_nnz > 0) { - hy_gold[i] *= h_beta; - for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; - ++j) + T coo_beta = (ell_nnz > 0) ? 
static_cast(1) : h_beta; + + for(rocsparse_int i = 0; i < m; ++i) { - hy_gold[i] += h_alpha * hval[j] * hx[hcol_ind[j] - idx_base]; + hy_gold[i] *= coo_beta; + } + + for(rocsparse_int i = 0; i < coo_nnz; ++i) + { + rocsparse_int row = hcoo_row[i] - idx_base; + rocsparse_int col = hcoo_col[i] - idx_base; + + hy_gold[row] += h_alpha * hcoo_val[i] * hx[col]; } } cpu_time_used = get_time_us() - cpu_time_used; - unit_check_general(1, m, hy_gold.data(), hy_1.data()); - unit_check_general(1, m, hy_gold.data(), hy_2.data()); + unit_check_near(1, m, 1, hy_gold.data(), hy_1.data()); + unit_check_near(1, m, 1, hy_gold.data(), hy_2.data()); } if(argus.timing) diff --git a/clients/tests/test_csr2hyb.cpp b/clients/tests/test_csr2hyb.cpp index 3b435c7a..fd7d89d7 100644 --- a/clients/tests/test_csr2hyb.cpp +++ b/clients/tests/test_csr2hyb.cpp @@ -30,6 +30,7 @@ std::string csr2hyb_bin[] = {"rma10.bin", "bibd_22_8.bin", "mc2depi.bin", "scircuit.bin", + "ASIC_320k.bin", "bmwcra_1.bin", "nos1.bin", "nos2.bin", diff --git a/clients/tests/test_hybmv.cpp b/clients/tests/test_hybmv.cpp index cecf7747..c77f051a 100644 --- a/clients/tests/test_hybmv.cpp +++ b/clients/tests/test_hybmv.cpp @@ -8,9 +8,12 @@ #include #include #include +#include typedef std::tuple hybmv_tuple; +typedef std::tuple + hybmv_bin_tuple; int hyb_M_range[] = {-1, 0, 10, 500, 7111, 10000}; int hyb_N_range[] = {-3, 0, 33, 842, 4441, 10000}; @@ -25,6 +28,21 @@ rocsparse_hyb_partition hyb_partition[] = { int hyb_ELL_range[] = {0, 1, 2}; +std::string hyb_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "ASIC_320k.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + class parameterized_hybmv : public testing::TestWithParam { protected: @@ -34,6 +52,15 @@ class parameterized_hybmv : public testing::TestWithParam virtual void TearDown() {} }; +class parameterized_hybmv_bin : public 
testing::TestWithParam +{ + protected: + parameterized_hybmv_bin() {} + virtual ~parameterized_hybmv_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + Arguments setup_hybmv_arguments(hybmv_tuple tup) { Arguments arg; @@ -48,6 +75,39 @@ Arguments setup_hybmv_arguments(hybmv_tuple tup) return arg; } +Arguments setup_hybmv_arguments(hybmv_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = -99; + arg.alpha = std::get<0>(tup); + arg.beta = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.part = std::get<3>(tup); + arg.ell_width = std::get<4>(tup); + arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<5>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + + return arg; +} + TEST(hybmv_bad_arg, hybmv_float) { testing_hybmv_bad_arg(); } TEST_P(parameterized_hybmv, hybmv_float) @@ -66,6 +126,22 @@ TEST_P(parameterized_hybmv, hybmv_double) EXPECT_EQ(status, rocsparse_status_success); } +TEST_P(parameterized_hybmv_bin, hybmv_bin_float) +{ + Arguments arg = setup_hybmv_arguments(GetParam()); + + rocsparse_status status = testing_hybmv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_hybmv_bin, hybmv_bin_double) +{ + Arguments arg = setup_hybmv_arguments(GetParam()); + + rocsparse_status status = testing_hybmv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + INSTANTIATE_TEST_CASE_P(hybmv, parameterized_hybmv, testing::Combine(testing::ValuesIn(hyb_M_range), @@ -75,3 +151,12 @@ INSTANTIATE_TEST_CASE_P(hybmv, testing::ValuesIn(hyb_idxbase_range), testing::ValuesIn(hyb_partition), testing::ValuesIn(hyb_ELL_range))); + 
+INSTANTIATE_TEST_CASE_P(hybmv_bin, + parameterized_hybmv_bin, + testing::Combine(testing::ValuesIn(hyb_alpha_range), + testing::ValuesIn(hyb_beta_range), + testing::ValuesIn(hyb_idxbase_range), + testing::ValuesIn(hyb_partition), + testing::ValuesIn(hyb_ELL_range), + testing::ValuesIn(hyb_bin))); From d852d8b5cb2995b8f796c5845248e5e8122808e1 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 31 Jul 2018 10:55:48 +0200 Subject: [PATCH 195/304] added missing header in unit.cpp --- clients/common/unit.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index 87965860..1d5bdd54 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #ifdef GOOGLE_TEST From 4ee8768e1bb4cbfab5866e58f10f605b1902dca9 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 31 Jul 2018 11:38:53 +0200 Subject: [PATCH 196/304] csr2hyb perf optimization --- library/src/conversion/csr2hyb_device.h | 89 +++----------------- library/src/conversion/rocsparse_csr2hyb.hpp | 64 +++++++------- 2 files changed, 42 insertions(+), 111 deletions(-) diff --git a/library/src/conversion/csr2hyb_device.h b/library/src/conversion/csr2hyb_device.h index 494199bb..c536ddbb 100644 --- a/library/src/conversion/csr2hyb_device.h +++ b/library/src/conversion/csr2hyb_device.h @@ -10,101 +10,34 @@ #include -// Block reduce kernel computing sum +// Compute non-zero entries per CSR row to obtain the COO nnz per row. 
template -__device__ void sum_reduce(rocsparse_int tid, rocsparse_int* data) +__global__ void hyb_coo_nnz(rocsparse_int m, + rocsparse_int ell_width, + const rocsparse_int* csr_row_ptr, + rocsparse_int* coo_row_nnz, + rocsparse_index_base idx_base) { - __syncthreads(); - - for(rocsparse_int i = NB >> 1; i > 0; i >>= 1) - { - if(tid < i) - { - data[tid] += data[tid + i]; - } - - __syncthreads(); - } -} - -// Compute non-zero entries per CSR row and do a block reduction over the sum -// to obtain the number of COO part non-zero entries and COO nnz per row. -// Store the result in a workspace for final reduction on part2 -template -__global__ void hyb_coo_nnz_part1(rocsparse_int m, - rocsparse_int ell_width, - const rocsparse_int* csr_row_ptr, - rocsparse_int* workspace, - rocsparse_int* coo_row_nnz) -{ - rocsparse_int tid = hipThreadIdx_x; rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - __shared__ rocsparse_int sdata[NB]; - if(gid < m) { rocsparse_int row_nnz = csr_row_ptr[gid + 1] - csr_row_ptr[gid]; if(row_nnz > ell_width) { - row_nnz = row_nnz - ell_width; - sdata[tid] = row_nnz; - coo_row_nnz[gid] = row_nnz; + row_nnz = row_nnz - ell_width; + coo_row_nnz[gid + 1] = row_nnz; } else { - sdata[tid] = 0; - coo_row_nnz[gid] = 0; + coo_row_nnz[gid + 1] = 0; } } - else - { - sdata[tid] = 0; - } - - sum_reduce(tid, sdata); - - if(tid == 0) - { - workspace[hipBlockIdx_x] = sdata[0]; - } -} - -// Part2 kernel for final reduction over the sum of COO non-zero entries -template -__global__ void hyb_coo_nnz_part2(rocsparse_int m, rocsparse_int* workspace) -{ - rocsparse_int tid = hipThreadIdx_x; - - __shared__ rocsparse_int sdata[NB]; - sdata[tid] = 0; - - for(rocsparse_int i = tid; i < m; i += NB) - { - sdata[tid] += workspace[i]; - } - - __syncthreads(); - - if(m < 32) - { - if(tid == 0) - { - for(rocsparse_int i = 1; i < m; ++i) - { - sdata[0] += sdata[i]; - } - } - } - else - { - sum_reduce(tid, sdata); - } - if(tid == 0) + if(gid == 0) { - 
workspace[0] = sdata[0]; + coo_row_nnz[0] = idx_base; } } diff --git a/library/src/conversion/rocsparse_csr2hyb.hpp b/library/src/conversion/rocsparse_csr2hyb.hpp index 64c63151..6bedb47d 100644 --- a/library/src/conversion/rocsparse_csr2hyb.hpp +++ b/library/src/conversion/rocsparse_csr2hyb.hpp @@ -14,6 +14,7 @@ #include "csr2ell_device.h" #include +#include template rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, @@ -166,9 +167,6 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, #define CSR2ELL_DIM 512 // Workspace size rocsparse_int blocks = (m - 1) / CSR2ELL_DIM + 1; - // Allocate workspace - rocsparse_int* workspace = NULL; - RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace, sizeof(rocsparse_int) * blocks)); if(partition_type == rocsparse_hyb_partition_user) { @@ -182,6 +180,10 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, } else { + // Allocate workspace + rocsparse_int* workspace = nullptr; + RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace, sizeof(rocsparse_int) * blocks)); + // HYB == ELL - no COO part - compute maximum nnz per row hipLaunchKernelGGL((ell_width_kernel_part1), dim3(blocks), @@ -202,12 +204,13 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, // Copy ell width back to host RETURN_IF_HIP_ERROR( hipMemcpy(&hyb->ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + RETURN_IF_HIP_ERROR(hipFree(workspace)); } // Re-check ELL width if(hyb->ell_width > max_row_nnz) { - RETURN_IF_HIP_ERROR(hipFree(workspace)); return rocsparse_status_invalid_value; } @@ -222,9 +225,9 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, RETURN_IF_HIP_ERROR(hipMalloc(&hyb->ell_val, sizeof(T) * hyb->ell_nnz)); } - // Allocate workspace2 - rocsparse_int* workspace2 = NULL; - RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace2, sizeof(rocsparse_int) * (m + 1))); + // Allocate workspace + rocsparse_int* workspace = NULL; + 
RETURN_IF_HIP_ERROR(hipMalloc((void**)&workspace, sizeof(rocsparse_int) * (m + 1))); // If there is a COO part, compute the COO non-zero elements per row if(partition_type != rocsparse_hyb_partition_max) @@ -234,11 +237,11 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, { hyb->coo_nnz = csr_nnz; RETURN_IF_HIP_ERROR(hipMemcpy( - workspace2, csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToDevice)); + workspace, csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToDevice)); } else { - hipLaunchKernelGGL((hyb_coo_nnz_part1), + hipLaunchKernelGGL((hyb_coo_nnz), dim3((m - 1) / CSR2ELL_DIM + 1), dim3(CSR2ELL_DIM), 0, @@ -247,37 +250,32 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, hyb->ell_width, csr_row_ptr, workspace, - workspace2); + descr->base); - hipLaunchKernelGGL((hyb_coo_nnz_part2), - dim3(1), - dim3(CSR2ELL_DIM), - 0, - stream, - blocks, - workspace); + // Inclusive sum on workspace + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; - RETURN_IF_HIP_ERROR( - hipMemcpy(&hyb->coo_nnz, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + // Obtain hipcub buffer size + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, workspace, workspace, m + 1)); - // Perform exclusive scan on workspace TODO use rocPRIM - std::vector hbuf(m + 1); - RETURN_IF_HIP_ERROR(hipMemcpy( - hbuf.data() + 1, workspace2, sizeof(rocsparse_int) * m, hipMemcpyDeviceToHost)); + // Allocate hipcub buffer + RETURN_IF_HIP_ERROR(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Do inclusive sum + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, workspace, workspace, m + 1)); - hbuf[0] = descr->base; - for(rocsparse_int i = 0; i < m; ++i) - { - hbuf[i + 1] += hbuf[i]; - } + // Clear hipcub buffer + RETURN_IF_HIP_ERROR(hipFree(d_temp_storage)); + // Obtain coo nnz from workspace RETURN_IF_HIP_ERROR(hipMemcpy( - workspace2, 
hbuf.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + &hyb->coo_nnz, workspace + m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + hyb->coo_nnz -= descr->base; } } - RETURN_IF_HIP_ERROR(hipFree(workspace)); - // Allocate COO part if(hyb->coo_nnz > 0) { @@ -306,10 +304,10 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, hyb->coo_row_ind, hyb->coo_col_ind, (T*)hyb->coo_val, - workspace2, + workspace, descr->base); - RETURN_IF_HIP_ERROR(hipFree(workspace2)); + RETURN_IF_HIP_ERROR(hipFree(workspace)); #undef CSR2ELL_DIM return rocsparse_status_success; From 28f62ac7caa6c858eefc055c248ccd6a52efadcb Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 31 Jul 2018 11:42:34 +0200 Subject: [PATCH 197/304] clang-format --- clients/common/unit.cpp | 21 ++++++++++------ clients/include/testing_hybmv.hpp | 25 ++++++++++++++------ library/src/conversion/rocsparse_csr2hyb.hpp | 8 ++++--- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index 1d5bdd54..de6149ed 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -30,7 +30,8 @@ // a wrapper will cause the loop keep going template <> -void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int lda, float* hCPU, float* hGPU) +void unit_check_general( + rocsparse_int M, rocsparse_int N, rocsparse_int lda, float* hCPU, float* hGPU) { for(rocsparse_int j = 0; j < N; j++) { @@ -46,7 +47,8 @@ void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int lda, flo } template <> -void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int lda, double* hCPU, double* hGPU) +void unit_check_general( + rocsparse_int M, rocsparse_int N, rocsparse_int lda, double* hCPU, double* hGPU) { for(rocsparse_int j = 0; j < N; j++) { @@ -62,7 +64,8 @@ void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int lda, dou } template <> -void unit_check_general(rocsparse_int M, 
rocsparse_int N, rocsparse_int lda, rocsparse_int* hCPU, rocsparse_int* hGPU) +void unit_check_general( + rocsparse_int M, rocsparse_int N, rocsparse_int lda, rocsparse_int* hCPU, rocsparse_int* hGPU) { for(rocsparse_int j = 0; j < N; j++) { @@ -78,7 +81,8 @@ void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int lda, roc } template <> -void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int lda, size_t* hCPU, size_t* hGPU) +void unit_check_general( + rocsparse_int M, rocsparse_int N, rocsparse_int lda, size_t* hCPU, size_t* hGPU) { for(rocsparse_int j = 0; j < N; j++) { @@ -105,7 +109,8 @@ void unit_check_near(rocsparse_int M, rocsparse_int N, rocsparse_int lda, float* { for(rocsparse_int i = 0; i < M; i++) { - float compare_val = std::max(std::abs(hCPU[i + j * lda] * 1e-3f), 10 * std::numeric_limits::epsilon()); + float compare_val = std::max(std::abs(hCPU[i + j * lda] * 1e-3f), + 10 * std::numeric_limits::epsilon()); #ifdef GOOGLE_TEST ASSERT_NEAR(hCPU[i + j * lda], hGPU[i + j * lda], compare_val); #else @@ -116,13 +121,15 @@ void unit_check_near(rocsparse_int M, rocsparse_int N, rocsparse_int lda, float* } template <> -void unit_check_near(rocsparse_int M, rocsparse_int N, rocsparse_int lda, double* hCPU, double* hGPU) +void unit_check_near( + rocsparse_int M, rocsparse_int N, rocsparse_int lda, double* hCPU, double* hGPU) { for(rocsparse_int j = 0; j < N; j++) { for(rocsparse_int i = 0; i < M; i++) { - double compare_val = std::max(std::abs(hCPU[i + j * lda] * 1e-12), 10 * std::numeric_limits::epsilon()); + double compare_val = std::max(std::abs(hCPU[i + j * lda] * 1e-12), + 10 * std::numeric_limits::epsilon()); #ifdef GOOGLE_TEST ASSERT_NEAR(hCPU[i + j * lda], hGPU[i + j * lda], compare_val); #else diff --git a/clients/include/testing_hybmv.hpp b/clients/include/testing_hybmv.hpp index 158d0fad..51d7d94e 100644 --- a/clients/include/testing_hybmv.hpp +++ b/clients/include/testing_hybmv.hpp @@ -355,15 +355,26 @@ 
rocsparse_status testing_hybmv(Arguments argus) if(ell_nnz > 0) { - CHECK_HIP_ERROR(hipMemcpy(hell_col.data(), dhyb->ell_col_ind, sizeof(rocsparse_int) * ell_nnz, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hell_val.data(), dhyb->ell_val, sizeof(T) * ell_nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hell_col.data(), + dhyb->ell_col_ind, + sizeof(rocsparse_int) * ell_nnz, + hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy( + hell_val.data(), dhyb->ell_val, sizeof(T) * ell_nnz, hipMemcpyDeviceToHost)); } if(coo_nnz > 0) { - CHECK_HIP_ERROR(hipMemcpy(hcoo_row.data(), dhyb->coo_row_ind, sizeof(rocsparse_int) * coo_nnz, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hcoo_col.data(), dhyb->coo_col_ind, sizeof(rocsparse_int) * coo_nnz, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hcoo_val.data(), dhyb->coo_val, sizeof(T) * coo_nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hcoo_row.data(), + dhyb->coo_row_ind, + sizeof(rocsparse_int) * coo_nnz, + hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hcoo_col.data(), + dhyb->coo_col_ind, + sizeof(rocsparse_int) * coo_nnz, + hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy( + hcoo_val.data(), dhyb->coo_val, sizeof(T) * coo_nnz, hipMemcpyDeviceToHost)); } CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * m, hipMemcpyHostToDevice)); @@ -421,12 +432,12 @@ rocsparse_status testing_hybmv(Arguments argus) if(coo_nnz > 0) { T coo_beta = (ell_nnz > 0) ? 
static_cast(1) : h_beta; - + for(rocsparse_int i = 0; i < m; ++i) { hy_gold[i] *= coo_beta; } - + for(rocsparse_int i = 0; i < coo_nnz; ++i) { rocsparse_int row = hcoo_row[i] - idx_base; diff --git a/library/src/conversion/rocsparse_csr2hyb.hpp b/library/src/conversion/rocsparse_csr2hyb.hpp index 6bedb47d..c8a51b03 100644 --- a/library/src/conversion/rocsparse_csr2hyb.hpp +++ b/library/src/conversion/rocsparse_csr2hyb.hpp @@ -253,17 +253,19 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, descr->base); // Inclusive sum on workspace - void* d_temp_storage = nullptr; + void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; // Obtain hipcub buffer size - RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, workspace, workspace, m + 1)); + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum( + d_temp_storage, temp_storage_bytes, workspace, workspace, m + 1)); // Allocate hipcub buffer RETURN_IF_HIP_ERROR(hipMalloc(&d_temp_storage, temp_storage_bytes)); // Do inclusive sum - RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, workspace, workspace, m + 1)); + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum( + d_temp_storage, temp_storage_bytes, workspace, workspace, m + 1)); // Clear hipcub buffer RETURN_IF_HIP_ERROR(hipFree(d_temp_storage)); From f1ea2077b09720d81f5f66d1f86762ded06ded09 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 31 Jul 2018 12:01:10 +0200 Subject: [PATCH 198/304] adjusting transA paramter --- clients/include/testing_coosort.hpp | 2 +- clients/tests/test_coosort.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clients/include/testing_coosort.hpp b/clients/include/testing_coosort.hpp index 9814b870..76b0f3bc 100644 --- a/clients/include/testing_coosort.hpp +++ b/clients/include/testing_coosort.hpp @@ -171,7 +171,7 @@ rocsparse_status testing_coosort(Arguments argus) rocsparse_int m = argus.M; rocsparse_int n = 
argus.N; rocsparse_int safe_size = 100; - rocsparse_int by_row = argus.trans == rocsparse_operation_none; + rocsparse_int by_row = argus.transA == rocsparse_operation_none; rocsparse_int permute = argus.temp; rocsparse_index_base idx_base = argus.idx_base; std::string binfile = ""; diff --git a/clients/tests/test_coosort.cpp b/clients/tests/test_coosort.cpp index 3027b7d6..040dcfa8 100644 --- a/clients/tests/test_coosort.cpp +++ b/clients/tests/test_coosort.cpp @@ -57,7 +57,7 @@ Arguments setup_coosort_arguments(coosort_tuple tup) Arguments arg; arg.M = std::get<0>(tup); arg.N = std::get<1>(tup); - arg.trans = std::get<2>(tup); + arg.transA = std::get<2>(tup); arg.temp = std::get<3>(tup); arg.idx_base = std::get<4>(tup); arg.timing = 0; @@ -69,7 +69,7 @@ Arguments setup_coosort_arguments(coosort_bin_tuple tup) Arguments arg; arg.M = -99; arg.N = -99; - arg.trans = std::get<0>(tup); + arg.transA = std::get<0>(tup); arg.temp = std::get<1>(tup); arg.idx_base = std::get<2>(tup); arg.timing = 0; From 2592c043b45ea140d2e6f08b5d72e7bd41b87b49 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 31 Jul 2018 15:59:00 +0200 Subject: [PATCH 199/304] csrmm for op(B) = B and op(B) = transposed(B) --- library/include/rocsparse-functions.h | 16 +- library/src/level3/csrmm_device.h | 151 ++++- library/src/level3/rocsparse_csrmm.cpp | 10 +- library/src/level3/rocsparse_csrmm.hpp | 763 ++++++++++++++++++++++--- 4 files changed, 847 insertions(+), 93 deletions(-) diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index a6189a28..05b114b9 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -817,7 +817,9 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, handle rocsparse_handle. handle to the rocsparse library context queue. @param[in] - trans operation type of A. + trans_A operation type of A. + @param[in] + trans_B operation type of B. @param[in] m number of rows of A. 
@param[in] @@ -853,7 +855,8 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, ********************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_scsrmm(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans_A, + rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, rocsparse_int k, @@ -871,7 +874,8 @@ rocsparse_status rocsparse_scsrmm(rocsparse_handle handle, ROCSPARSE_EXPORT rocsparse_status rocsparse_dcsrmm(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans_A, + rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, rocsparse_int k, @@ -889,7 +893,8 @@ rocsparse_status rocsparse_dcsrmm(rocsparse_handle handle, /* ROCSPARSE_EXPORT rocsparse_status rocsparse_ccsrmm(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans_A, + rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, rocsparse_int k, @@ -907,7 +912,8 @@ rocsparse_status rocsparse_ccsrmm(rocsparse_handle handle, ROCSPARSE_EXPORT rocsparse_status rocsparse_zcsrmm(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans_A, + rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, rocsparse_int k, diff --git a/library/src/level3/csrmm_device.h b/library/src/level3/csrmm_device.h index 26382766..8e554857 100644 --- a/library/src/level3/csrmm_device.h +++ b/library/src/level3/csrmm_device.h @@ -4,22 +4,143 @@ #include -template -static __device__ void csrmmn_general_device(rocsparse_int m, - rocsparse_int n, - rocsparse_int k, - rocsparse_int nnz, - T alpha, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const T* csr_val, - const T* B, - rocsparse_int ldb, - T beta, - T* C, - rocsparse_int ldc, - rocsparse_index_base idx_base) +template +static __device__ void csrmmnn_general_device(rocsparse_int M, + rocsparse_int N, + rocsparse_int K, + rocsparse_int nnz, + T alpha, + const 
rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* csr_val, + const T* B, + rocsparse_int ldb, + T beta, + T* C, + rocsparse_int ldc, + rocsparse_index_base idx_base) { + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; + rocsparse_int warpid = gid / SUBWAVE_SIZE; + rocsparse_int laneid = gid & (SUBWAVE_SIZE - 1); + rocsparse_int subid = tid / SUBWAVE_SIZE; + rocsparse_int nwarps = hipGridDim_x * hipBlockDim_x / SUBWAVE_SIZE; + rocsparse_int col = laneid + hipBlockIdx_y * SUBWAVE_SIZE; + rocsparse_int colB = col * ldb; + rocsparse_int colC = col * ldc; + + __shared__ rocsparse_int shared_col[BLOCKSIZE/SUBWAVE_SIZE][SUBWAVE_SIZE]; + __shared__ T shared_val[BLOCKSIZE/SUBWAVE_SIZE][SUBWAVE_SIZE]; + + for(rocsparse_int row = warpid; row < M; row += nwarps) + { + rocsparse_int row_start = __ldg(csr_row_ptr + row) - idx_base; + rocsparse_int row_end = __ldg(csr_row_ptr + row + 1) - idx_base; + + T sum = static_cast(0); + + for(rocsparse_int j = row_start; j < row_end; j += SUBWAVE_SIZE) + { + rocsparse_int k = j + laneid; + + __syncthreads(); + + shared_col[subid][laneid] = (k < row_end) ? __ldg(csr_col_ind + k) - idx_base : 0; + shared_val[subid][laneid] = (k < row_end) ? 
alpha * __ldg(csr_val + k) : static_cast(0); + + __syncthreads(); + + for(rocsparse_int i = 0; i < SUBWAVE_SIZE && col < N; ++i) + { + sum += shared_val[subid][i] * __ldg(&B[shared_col[subid][i] + colB]); + } + } + + if(col < N) + { + if(beta == 0.0) + { + C[row + colC] = sum; + } + else + { + C[row + colC] = __ldg(&C[row + colC]) * beta + sum; + } + } + } +} + +template +static __device__ void csrmmnt_general_device(rocsparse_int offset, + rocsparse_int ncol, + rocsparse_int M, + rocsparse_int N, + rocsparse_int K, + rocsparse_int nnz, + T alpha, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + const T* csr_val, + const T* B, + rocsparse_int ldb, + T beta, + T* C, + rocsparse_int ldc, + rocsparse_index_base idx_base) +{ + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; + rocsparse_int row = gid / SUBWAVE_SIZE; + rocsparse_int laneid = tid & (SUBWAVE_SIZE - 1); + rocsparse_int subid = hipThreadIdx_x / SUBWAVE_SIZE; + + if(row >= M) + { + return; + } + + __shared__ rocsparse_int shared_col[BLOCKSIZE/SUBWAVE_SIZE][SUBWAVE_SIZE]; + __shared__ T shared_val[BLOCKSIZE/SUBWAVE_SIZE][SUBWAVE_SIZE]; + + rocsparse_int row_start = __ldg(csr_row_ptr + row) - idx_base; + rocsparse_int row_end = __ldg(csr_row_ptr + row + 1) - idx_base; + + for(rocsparse_int l = offset; l < ncol; l += SUBWAVE_SIZE) + { + rocsparse_int col = l + laneid; + T sum = static_cast(0); + + for(rocsparse_int j = row_start; j < row_end; j += SUBWAVE_SIZE) + { + rocsparse_int k = j + laneid; + + __syncthreads(); + + shared_col[subid][laneid] = (k < row_end) ? N * (__ldg(csr_col_ind + k) - idx_base) : 0; + shared_val[subid][laneid] = (k < row_end) ? alpha * __ldg(csr_val + k) : static_cast(0); + + __syncthreads(); + + for(rocsparse_int i = 0; i < SUBWAVE_SIZE; ++i) + { + T val_B = (col < ncol) ? 
__ldg(B + col + shared_col[subid][i]) : static_cast(0); + sum += shared_val[subid][i] * val_B; + } + } + + if(col < ncol) + { + if(beta == static_cast(0)) + { + C[row + col * ldc] = sum; + } + else + { + C[row + col * ldc] = beta * __ldg(C + row + col * ldc) + sum; + } + } + } } #endif // CSRMM_DEVICE_H diff --git a/library/src/level3/rocsparse_csrmm.cpp b/library/src/level3/rocsparse_csrmm.cpp index df780cc5..02d634bc 100644 --- a/library/src/level3/rocsparse_csrmm.cpp +++ b/library/src/level3/rocsparse_csrmm.cpp @@ -12,7 +12,8 @@ */ extern "C" rocsparse_status rocsparse_scsrmm(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans_A, + rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, rocsparse_int k, @@ -29,11 +30,12 @@ extern "C" rocsparse_status rocsparse_scsrmm(rocsparse_handle handle, rocsparse_int ldc) { return rocsparse_csrmm_template( - handle, trans, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); + handle, trans_A, trans_B, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); } extern "C" rocsparse_status rocsparse_dcsrmm(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans_A, + rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, rocsparse_int k, @@ -50,5 +52,5 @@ extern "C" rocsparse_status rocsparse_dcsrmm(rocsparse_handle handle, rocsparse_int ldc) { return rocsparse_csrmm_template( - handle, trans, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); + handle, trans_A, trans_B, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); } diff --git a/library/src/level3/rocsparse_csrmm.hpp b/library/src/level3/rocsparse_csrmm.hpp index 7df7797b..51f7b7af 100644 --- a/library/src/level3/rocsparse_csrmm.hpp +++ b/library/src/level3/rocsparse_csrmm.hpp @@ -13,47 +13,104 @@ #include -template -__global__ void 
csrmmn_kernel_host_pointer(rocsparse_int m, - rocsparse_int n, - rocsparse_int k, - rocsparse_int nnz, - T alpha, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const T* csr_val, - const T* B, - rocsparse_int ldb, - T beta, - T* C, - rocsparse_int ldc, - rocsparse_index_base idx_base) +template +__launch_bounds__(256) +__global__ void csrmmnn_kernel_host_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + T alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ B, + rocsparse_int ldb, + T beta, + T* __restrict__ C, + rocsparse_int ldc, + rocsparse_index_base idx_base) { - csrmmn_general_device(m, n, k, nnz, alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, beta, C, ldc, idx_base); + csrmmnn_general_device(m, n, k, nnz, alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, beta, C, ldc, idx_base); } -template -__global__ void csrmmn_kernel_device_pointer(rocsparse_int m, - rocsparse_int n, - rocsparse_int k, - rocsparse_int nnz, - const T* alpha, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const T* csr_val, - const T* B, - rocsparse_int ldb, - const T* beta, - T* C, - rocsparse_int ldc, - rocsparse_index_base idx_base) +template +__launch_bounds__(256) +__global__ void csrmmnn_kernel_device_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const T* alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ B, + rocsparse_int ldb, + const T* beta, + T* __restrict__ C, + rocsparse_int ldc, + rocsparse_index_base idx_base) +{ + if(*alpha == 0.0 && *beta == 1.0) + { + return; + } + + csrmmnn_general_device(m, n, k, nnz, *alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, *beta, C, ldc, idx_base); +} + +template +__launch_bounds__(256) 
+__global__ void csrmmnt_kernel_host_pointer(rocsparse_int offset, + rocsparse_int ncol, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + T alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ B, + rocsparse_int ldb, + T beta, + T* __restrict__ C, + rocsparse_int ldc, + rocsparse_index_base idx_base) +{ + csrmmnt_general_device(offset, ncol, m, n, k, nnz, alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, beta, C, ldc, idx_base); +} + +template +__launch_bounds__(256) +__global__ void csrmmnt_kernel_device_pointer(rocsparse_int offset, + rocsparse_int ncol, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const T* alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ B, + rocsparse_int ldb, + const T* beta, + T* __restrict__ C, + rocsparse_int ldc, + rocsparse_index_base idx_base) { - csrmmn_general_device(m, n, k, nnz, *alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, *beta, C, ldc, idx_base); + if(*alpha == 0.0 && *beta == 1.0) + { + return; + } + + csrmmnt_general_device(offset, ncol, m, n, k, nnz, *alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, *beta, C, ldc, idx_base); } template rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans_A, + rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, rocsparse_int k, @@ -84,7 +141,8 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, { log_trace(handle, replaceX("rocsparse_Xcsrmm"), - trans, + trans_A, + trans_B, m, n, k, @@ -104,7 +162,8 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, { log_trace(handle, replaceX("rocsparse_Xcsrmm"), - trans, + trans_A, + trans_B, m, n, k, @@ -150,30 +209,6 @@ rocsparse_status 
rocsparse_csrmm_template(rocsparse_handle handle, return rocsparse_status_invalid_size; } - // Check leading dimensions - if(trans == rocsparse_operation_none) - { - if(ldb < std::max(1, k)) - { - return rocsparse_status_invalid_size; - } - else if(ldc < std::max(1, m)) - { - return rocsparse_status_invalid_size; - } - } - else - { - if(ldb < std::max(1, m)) - { - return rocsparse_status_invalid_size; - } - else if(ldc < std::max(1, k)) - { - return rocsparse_status_invalid_size; - } - } - // Check pointer arguments if(csr_val == nullptr) { @@ -210,27 +245,617 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, return rocsparse_status_success; } + // Check leading dimension of B + if(trans_B == rocsparse_operation_none) + { + if(trans_A == rocsparse_operation_none) + { + if(ldb < std::max(1, k)) + { + return rocsparse_status_invalid_size; + } + } + else + { + if(ldb < std::max(1, m)) + { + return rocsparse_status_invalid_size; + } + } + } + else + { + if(ldb < std::max(1, n)) + { + return rocsparse_status_invalid_size; + } + } + + // Check leading dimension of C + if(trans_A == rocsparse_operation_none) + { + if(ldc < std::max(1, m)) + { + return rocsparse_status_invalid_size; + } + } + else + { + if(ldc < std::max(1, k)) + { + return rocsparse_status_invalid_size; + } + } + // Stream hipStream_t stream = handle->stream; // Run different csrmv kernels - if(trans == rocsparse_operation_none) + if(trans_A == rocsparse_operation_none) { -#define CSRMMN_DIM 512 - dim3 csrmmn_blocks((m - 1) / CSRMMN_DIM + 1); - dim3 csrmmn_threads(CSRMMN_DIM); - - if(handle->pointer_mode == rocsparse_pointer_mode_device) + if(trans_B == rocsparse_operation_none) { +#define CSRMMNN_DIM 256 +#define SUB_WF_SIZE 8 + dim3 csrmmnn_blocks((SUB_WF_SIZE * m - 1) / CSRMMNN_DIM + 1, (n - 1) / SUB_WF_SIZE + 1); + dim3 csrmmnn_threads(CSRMMNN_DIM); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + hipLaunchKernelGGL((csrmmnn_kernel_device_pointer), + 
csrmmnn_blocks, + csrmmnn_threads, + 0, + stream, + m, + n, + k, + nnz, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + beta, + C, + ldc, + descr->base); + } + else + { + if(*alpha == 0.0 && *beta == 1.0) + { + return rocsparse_status_success; + } + + hipLaunchKernelGGL((csrmmnn_kernel_host_pointer), + csrmmnn_blocks, + csrmmnn_threads, + 0, + stream, + m, + n, + k, + nnz, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + *beta, + C, + ldc, + descr->base); + } +#undef SUB_WF_SIZE +#undef CSRMMNN_DIM } - else + else if(trans_B == rocsparse_operation_transpose) { - if(*alpha == 0.0 && *beta == 1.0) + // Average nnz per row of A + rocsparse_int avg_row_nnz = (nnz - 1) / m + 1; + +#define CSRMMNT_DIM 256 + if(handle->pointer_mode == rocsparse_pointer_mode_device) { - return rocsparse_status_success; + // Computation is split into two parts, main and remainder + // First step: Compute main, which is the maximum number of + // columns of B that is dividable by the next + // power of two of the average row nnz of A. + // Second step: Compute remainder, which is the remaining + // columns of B. 
+ rocsparse_int main = 0; + rocsparse_int remainder = 0; + + // Launch appropriate kernel depending on row nnz of A + if(avg_row_nnz < 16) + { + remainder = n % 8; + main = n - remainder; + + // Launch main kernel if enough columns of B + if(main > 0) + { + hipLaunchKernelGGL((csrmmnt_kernel_device_pointer), + dim3((8 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + 0, + main, + m, + n, + k, + nnz, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + beta, + C, + ldc, + descr->base); + } + } + else if(avg_row_nnz < 32) + { + remainder = n % 16; + main = n - remainder; + + // Launch main kernel if enough columns of B + if(main > 0) + { + hipLaunchKernelGGL((csrmmnt_kernel_device_pointer), + dim3((16 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + 0, + main, + m, + n, + k, + nnz, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + beta, + C, + ldc, + descr->base); + } + } + else if(avg_row_nnz < 64 || handle->warp_size == 32) + { + remainder = n % 32; + main = n - remainder; + + // Launch main kernel if enough columns of B + if(main > 0) + { + hipLaunchKernelGGL((csrmmnt_kernel_device_pointer), + dim3((32 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + 0, + main, + m, + n, + k, + nnz, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + beta, + C, + ldc, + descr->base); + } + } + else if(handle->warp_size == 64) + { + remainder = n % 64; + main = n - remainder; + + // Launch main kernel if enough columns of B + if(main > 0) + { + hipLaunchKernelGGL((csrmmnt_kernel_device_pointer), + dim3((64 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + 0, + main, + m, + n, + k, + nnz, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + beta, + C, + ldc, + descr->base); + } + } + else + { + return rocsparse_status_arch_mismatch; + } + + // Process remainder + if(remainder > 0) + { + if(remainder <= 8) + { + hipLaunchKernelGGL((csrmmnt_kernel_device_pointer), + 
dim3((8 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + main, + n, + m, + n, + k, + nnz, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + beta, + C, + ldc, + descr->base); + } + else if(remainder <= 16) + { + hipLaunchKernelGGL((csrmmnt_kernel_device_pointer), + dim3((16 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + main, + n, + m, + n, + k, + nnz, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + beta, + C, + ldc, + descr->base); + } + else if(remainder <= 32 || handle->warp_size == 32) + { + hipLaunchKernelGGL((csrmmnt_kernel_device_pointer), + dim3((32 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + main, + n, + m, + n, + k, + nnz, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + beta, + C, + ldc, + descr->base); + } + else if(remainder <= 64) + { + hipLaunchKernelGGL((csrmmnt_kernel_device_pointer), + dim3((64 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + main, + n, + m, + n, + k, + nnz, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + beta, + C, + ldc, + descr->base); + } + else + { + return rocsparse_status_arch_mismatch; + } + } } + else + { + // Quick return + if(*alpha == 0.0 && *beta == 1.0) + { + return rocsparse_status_success; + } + + rocsparse_int main = 0; + rocsparse_int remainder = 0; + + // Launch appropriate kernel + if(avg_row_nnz < 16) + { + remainder = n % 8; + main = n - remainder; + + // Launch main kernel if enough columns of B + if(main > 0) + { + hipLaunchKernelGGL((csrmmnt_kernel_host_pointer), + dim3((8 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + 0, + main, + m, + n, + k, + nnz, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + *beta, + C, + ldc, + descr->base); + } + } + else if(avg_row_nnz < 32) + { + remainder = n % 16; + main = n - remainder; + + // Launch main kernel if enough columns of B + if(main > 0) + { + 
hipLaunchKernelGGL((csrmmnt_kernel_host_pointer), + dim3((16 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + 0, + main, + m, + n, + k, + nnz, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + *beta, + C, + ldc, + descr->base); + } + } + else if(avg_row_nnz < 64 || handle->warp_size == 32) + { + remainder = n % 32; + main = n - remainder; + + // Launch main kernel if enough columns of B + if(main > 0) + { + hipLaunchKernelGGL((csrmmnt_kernel_host_pointer), + dim3((32 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + 0, + main, + m, + n, + k, + nnz, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + *beta, + C, + ldc, + descr->base); + } + } + else if(handle->warp_size == 64) + { + remainder = n % 64; + main = n - remainder; + + // Launch main kernel if enough columns of B + if(main > 0) + { + hipLaunchKernelGGL((csrmmnt_kernel_host_pointer), + dim3((64 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + 0, + main, + m, + n, + k, + nnz, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + *beta, + C, + ldc, + descr->base); + } + } + else + { + return rocsparse_status_arch_mismatch; + } + + // Process remainder + if(remainder > 0) + { + if(remainder <= 8) + { + hipLaunchKernelGGL((csrmmnt_kernel_host_pointer), + dim3((8 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + main, + n, + m, + n, + k, + nnz, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + *beta, + C, + ldc, + descr->base); + } + else if(remainder <= 16) + { + hipLaunchKernelGGL((csrmmnt_kernel_host_pointer), + dim3((16 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + main, + n, + m, + n, + k, + nnz, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + *beta, + C, + ldc, + descr->base); + } + else if(remainder <= 32 || handle->warp_size == 32) + { + hipLaunchKernelGGL((csrmmnt_kernel_host_pointer), + dim3((32 * m - 1) / CSRMMNT_DIM + 1), + 
dim3(CSRMMNT_DIM), + 0, + stream, + main, + n, + m, + n, + k, + nnz, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + *beta, + C, + ldc, + descr->base); + } + else if(remainder <= 64) + { + hipLaunchKernelGGL((csrmmnt_kernel_host_pointer), + dim3((64 * m - 1) / CSRMMNT_DIM + 1), + dim3(CSRMMNT_DIM), + 0, + stream, + main, + n, + m, + n, + k, + nnz, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + *beta, + C, + ldc, + descr->base); + } + else + { + return rocsparse_status_arch_mismatch; + } + } + } +#undef CSRMMNT_DIM + } + else + { + return rocsparse_status_not_implemented; } -#undef CSRMM_DIM } else { From f689bf850bb69c518d27aed831b648b39efbdecf Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 31 Jul 2018 15:59:39 +0200 Subject: [PATCH 200/304] tests for csrmm --- .../rocsparse_template_specialization.cpp | 10 +- clients/common/unit.cpp | 2 +- clients/include/rocsparse.hpp | 3 +- clients/include/testing_csrmm.hpp | 307 ++++++++++-------- clients/tests/test_csrmm.cpp | 115 ++++++- 5 files changed, 289 insertions(+), 148 deletions(-) diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 160e18ae..24eab0a5 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -288,7 +288,8 @@ rocsparse_status rocsparse_hybmv(rocsparse_handle handle, template <> rocsparse_status rocsparse_csrmm(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans_A, + rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, rocsparse_int k, @@ -304,12 +305,13 @@ rocsparse_status rocsparse_csrmm(rocsparse_handle handle, float* C, rocsparse_int ldc) { - return rocsparse_scsrmm(handle, trans, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); + return rocsparse_scsrmm(handle, trans_A, trans_B, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, 
B, ldb, beta, C, ldc); } template <> rocsparse_status rocsparse_csrmm(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans_A, + rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, rocsparse_int k, @@ -325,7 +327,7 @@ rocsparse_status rocsparse_csrmm(rocsparse_handle handle, double* C, rocsparse_int ldc) { - return rocsparse_dcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); + return rocsparse_dcsrmm(handle, trans_A, trans_B, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); } template <> diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index de6149ed..ef5824c3 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -128,7 +128,7 @@ void unit_check_near( { for(rocsparse_int i = 0; i < M; i++) { - double compare_val = std::max(std::abs(hCPU[i + j * lda] * 1e-12), + double compare_val = std::max(std::abs(hCPU[i + j * lda] * 1e-10), 10 * std::numeric_limits::epsilon()); #ifdef GOOGLE_TEST ASSERT_NEAR(hCPU[i + j * lda], hGPU[i + j * lda], compare_val); diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index a2611d1d..316d42eb 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -117,7 +117,8 @@ rocsparse_status rocsparse_hybmv(rocsparse_handle handle, template rocsparse_status rocsparse_csrmm(rocsparse_handle handle, - rocsparse_operation trans, + rocsparse_operation trans_A, + rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n, rocsparse_int k, diff --git a/clients/include/testing_csrmm.hpp b/clients/include/testing_csrmm.hpp index c5c8a85e..6c0cf23d 100644 --- a/clients/include/testing_csrmm.hpp +++ b/clients/include/testing_csrmm.hpp @@ -21,16 +21,17 @@ template void testing_csrmm_bad_arg(void) { - rocsparse_int n = 100; - rocsparse_int m = 100; - rocsparse_int k = 100; - rocsparse_int ldb = 100; - rocsparse_int ldc = 100; - rocsparse_int nnz = 
100; - rocsparse_int safe_size = 100; - T alpha = 0.6; - T beta = 0.2; - rocsparse_operation trans = rocsparse_operation_none; + rocsparse_int N = 100; + rocsparse_int M = 100; + rocsparse_int K = 100; + rocsparse_int ldb = 100; + rocsparse_int ldc = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + T alpha = 0.6; + T beta = 0.2; + rocsparse_operation transA = rocsparse_operation_none; + rocsparse_operation transB = rocsparse_operation_none; rocsparse_status status; std::unique_ptr unique_ptr_handle(new handle_struct); @@ -64,7 +65,7 @@ void testing_csrmm_bad_arg(void) rocsparse_int* dptr_null = nullptr; status = rocsparse_csrmm( - handle, trans, m, n, k, nnz, &alpha, descr, dval, dptr_null, dcol, dB, ldb, &beta, dC, ldc); + handle, transA, transB, M, N, K, nnz, &alpha, descr, dval, dptr_null, dcol, dB, ldb, &beta, dC, ldc); verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); } // testing for(nullptr == dcol) @@ -72,7 +73,7 @@ void testing_csrmm_bad_arg(void) rocsparse_int* dcol_null = nullptr; status = rocsparse_csrmm( - handle, trans, m, n, k, nnz, &alpha, descr, dval, dptr, dcol_null, dB, ldb, &beta, dC, ldc); + handle, transA, transB, M, N, K, nnz, &alpha, descr, dval, dptr, dcol_null, dB, ldb, &beta, dC, ldc); verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); } // testing for(nullptr == dval) @@ -80,7 +81,7 @@ void testing_csrmm_bad_arg(void) T* dval_null = nullptr; status = rocsparse_csrmm( - handle, trans, m, n, k, nnz, &alpha, descr, dval_null, dptr, dcol, dB, ldb, &beta, dC, ldc); + handle, transA, transB, M, N, K, nnz, &alpha, descr, dval_null, dptr, dcol, dB, ldb, &beta, dC, ldc); verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); } // testing for(nullptr == dB) @@ -88,7 +89,7 @@ void testing_csrmm_bad_arg(void) T* dB_null = nullptr; status = rocsparse_csrmm( - handle, trans, m, n, k, nnz, &alpha, descr, dval, dptr, dcol, dB_null, ldb, &beta, dC, ldc); + handle, 
transA, transB, M, N, K, nnz, &alpha, descr, dval, dptr, dcol, dB_null, ldb, &beta, dC, ldc); verify_rocsparse_status_invalid_pointer(status, "Error: dB is nullptr"); } // testing for(nullptr == dC) @@ -96,7 +97,7 @@ void testing_csrmm_bad_arg(void) T* dC_null = nullptr; status = rocsparse_csrmm( - handle, trans, m, n, k, nnz, &alpha, descr, dval, dptr, dcol, dB, ldb, &beta, dC_null, ldc); + handle, transA, transB, M, N, K, nnz, &alpha, descr, dval, dptr, dcol, dB, ldb, &beta, dC_null, ldc); verify_rocsparse_status_invalid_pointer(status, "Error: dC is nullptr"); } // testing for(nullptr == d_alpha) @@ -104,7 +105,7 @@ void testing_csrmm_bad_arg(void) T* d_alpha_null = nullptr; status = rocsparse_csrmm( - handle, trans, m, n, k, nnz, d_alpha_null, descr, dval, dptr, dcol, dB, ldb, &beta, dC, ldc); + handle, transA, transB, M, N, K, nnz, d_alpha_null, descr, dval, dptr, dcol, dB, ldb, &beta, dC, ldc); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } // testing for(nullptr == d_beta) @@ -112,7 +113,7 @@ void testing_csrmm_bad_arg(void) T* d_beta_null = nullptr; status = rocsparse_csrmm( - handle, trans, m, n, k, nnz, &alpha, descr, dval, dptr, dcol, dB, ldb, d_beta_null, dC, ldc); + handle, transA, transB, M, N, K, nnz, &alpha, descr, dval, dptr, dcol, dB, ldb, d_beta_null, dC, ldc); verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); } // testing for(nullptr == descr) @@ -120,7 +121,7 @@ void testing_csrmm_bad_arg(void) rocsparse_mat_descr descr_null = nullptr; status = rocsparse_csrmm( - handle, trans, m, n, k, nnz, &alpha, descr_null, dval, dptr, dcol, dB, ldb, &beta, dC, ldc); + handle, transA, transB, M, N, K, nnz, &alpha, descr_null, dval, dptr, dcol, dB, ldb, &beta, dC, ldc); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } // testing for(nullptr == handle) @@ -128,7 +129,7 @@ void testing_csrmm_bad_arg(void) rocsparse_handle handle_null = nullptr; status = rocsparse_csrmm( - 
handle_null, trans, m, n, k, nnz, &alpha, descr, dval, dptr, dcol, dB, ldb, &beta, dC, ldc); + handle_null, transA, transB, M, N, K, nnz, &alpha, descr, dval, dptr, dcol, dB, ldb, &beta, dC, ldc); verify_rocsparse_status_invalid_handle(status); } } @@ -136,16 +137,34 @@ void testing_csrmm_bad_arg(void) template rocsparse_status testing_csrmm(Arguments argus) { -/* rocsparse_int safe_size = 100; - rocsparse_int m = argus.M; - rocsparse_int n = argus.N; + rocsparse_int M = argus.M; + rocsparse_int N = argus.N; + rocsparse_int K = argus.K; + rocsparse_int ldb = argus.ldb; + rocsparse_int ldc = argus.ldc; T h_alpha = argus.alpha; T h_beta = argus.beta; - rocsparse_operation trans = argus.trans; + rocsparse_operation transA = argus.transA; + rocsparse_operation transB = argus.transB; rocsparse_index_base idx_base = argus.idx_base; + std::string binfile = ""; + std::string filename = ""; rocsparse_status status; + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(M == -99 && K == -99 && argus.timing == 0) + { + binfile = argus.filename; + M = K = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + std::unique_ptr test_handle(new handle_struct); rocsparse_handle handle = test_handle->handle; @@ -157,190 +176,215 @@ rocsparse_status testing_csrmm(Arguments argus) // Determine number of non-zero elements double scale = 0.02; - if(m > 1000 || n > 1000) + if(M > 1000 || K > 1000) { - scale = 2.0 / std::max(m, n); + scale = 2.0 / std::max(M, K); } - rocsparse_int nnz = m * scale * n; + rocsparse_int nnz = M * scale * K; // Argument sanity check before allocating invalid memory - if(m <= 0 || n <= 0 || nnz <= 0) + if(M <= 0 || N <= 0 || K <= 0 || nnz <= 0) { auto dptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto dcol_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto dval_managed = 
rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; - auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; - auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dB_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dC_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); T* dval = (T*)dval_managed.get(); - T* dx = (T*)dx_managed.get(); - T* dy = (T*)dy_managed.get(); + T* dB = (T*)dB_managed.get(); + T* dC = (T*)dC_managed.get(); - if(!dval || !dptr || !dcol || !dx || !dy) + if(!dval || !dptr || !dcol || !dB || !dC) { verify_rocsparse_status_success(rocsparse_status_memory_error, - "!dptr || !dcol || !dval || !dx || !dy"); + "!dptr || !dcol || !dval || !dB || !dC"); return rocsparse_status_memory_error; } CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - status = rocsparse_csrmm( - handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy); + status = rocsparse_csrmm(handle, transA, transB, M, N, K, nnz, &h_alpha, descr, dval, dptr, dcol, dB, ldb, &h_beta, dC, ldc); - if(m < 0 || n < 0 || nnz < 0) + if(M < 0 || N < 0 || K < 0 || nnz < 0) { - verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); + verify_rocsparse_status_invalid_size(status, "Error: M < 0 || N < 0 || K < 0 || nnz < 0"); } else { - verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + verify_rocsparse_status_success(status, "M >= 0 && N >= 0 && K >= 0 && nnz >= 0"); } return rocsparse_status_success; } - // Host structures - std::vector hcsr_row_ptr; - std::vector hcoo_row_ind; - std::vector hcol_ind; - std::vector hval; + // Initialize random seed + srand(12345ULL); + + // Host structures - CSR matrix A + 
std::vector hcsr_row_ptrA; + std::vector hcsr_col_indA; + std::vector hcsr_valA; // Initial Data on CPU - srand(12345ULL); - if(argus.laplacian) + if(binfile != "") { - m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcol_ind, hval, idx_base); - nnz = hcsr_row_ptr[m]; + if(read_bin_matrix(binfile.c_str(), M, K, nnz, hcsr_row_ptrA, hcsr_col_indA, hcsr_valA, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + } + else if(argus.laplacian) + { + M = K = gen_2d_laplacian(argus.laplacian, hcsr_row_ptrA, hcsr_col_indA, hcsr_valA, idx_base); + nnz = hcsr_row_ptrA[M]; } else { - if(argus.filename != "") + std::vector hcoo_row_indA; + + if(filename != "") { - if(read_mtx_matrix(argus.filename.c_str(), m, n, nnz, hcoo_row_ind, hcol_ind, hval) != - 0) + if(read_mtx_matrix(filename.c_str(), M, K, nnz, hcoo_row_indA, hcsr_col_indA, hcsr_valA, idx_base) != 0) { - fprintf(stderr, "Cannot open [read] %s\n", argus.filename.c_str()); + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; } } else { - gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcol_ind, hval, idx_base); + gen_matrix_coo(M, K, nnz, hcoo_row_indA, hcsr_col_indA, hcsr_valA, idx_base); } // Convert COO to CSR - if(!argus.laplacian) + hcsr_row_ptrA.resize(M + 1, 0); + for(rocsparse_int i = 0; i < nnz; ++i) { - hcsr_row_ptr.resize(m + 1, 0); - for(rocsparse_int i = 0; i < nnz; ++i) - { - ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; - } + ++hcsr_row_ptrA[hcoo_row_indA[i] + 1 - idx_base]; + } - hcsr_row_ptr[0] = idx_base; - for(rocsparse_int i = 0; i < m; ++i) - { - hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; - } + hcsr_row_ptrA[0] = idx_base; + for(rocsparse_int i = 0; i < M; ++i) + { + hcsr_row_ptrA[i + 1] += hcsr_row_ptrA[i]; } } - std::vector hx(n); - std::vector hy_1(m); - std::vector hy_2(m); - std::vector hy_gold(m); + if(transB == rocsparse_operation_none) + { + ldb = (transA == 
rocsparse_operation_none) ? K : M; + } + else + { + ldb = N; + } + + ldc = (transA == rocsparse_operation_none) ? M : K; + + rocsparse_int Anrow = M; + rocsparse_int Ancol = K; + rocsparse_int Bnrow = ldb; + rocsparse_int Bncol = (transB == rocsparse_operation_none) ? N : K; + rocsparse_int Bnnz = Bnrow * Bncol; + rocsparse_int Cnrow = ldc; + rocsparse_int Cncol = N; + rocsparse_int Cnnz = Cnrow * Cncol; - rocsparse_init(hx, 1, n); - rocsparse_init(hy_1, 1, m); + // Host structures - Dense matrix B and C + std::vector hB(Bnnz); + std::vector hC_1(Cnnz); + std::vector hC_2(Cnnz); + std::vector hC_gold(Cnnz); - // copy vector is easy in STL; hy_gold = hx: save a copy in hy_gold which will be output of CPU - hy_2 = hy_1; - hy_gold = hy_1; + rocsparse_init(hB, Bnrow, Bncol); + rocsparse_init(hC_1, Cnrow, Cncol); + + // copy vector is easy in STL; hC_gold = hC_1: save a copy in hy_gold which will be output of CPU + hC_gold = hC_1; + hC_2 = hC_1; // allocate memory on device - auto dptr_managed = - rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; - auto dcol_managed = + auto dcsr_row_ptrA_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (M + 1)), device_free}; + auto dcsr_col_indA_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; - auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; - auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * n), device_free}; - auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; - auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto dcsr_valA_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dB_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * Bnnz), device_free}; + auto dC_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * Cnnz), device_free}; + auto dC_2_managed = 
rocsparse_unique_ptr{device_malloc(sizeof(T) * Cnnz), device_free}; auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; auto d_beta_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; - rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); - rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); - T* dval = (T*)dval_managed.get(); - T* dx = (T*)dx_managed.get(); - T* dy_1 = (T*)dy_1_managed.get(); - T* dy_2 = (T*)dy_2_managed.get(); - T* d_alpha = (T*)d_alpha_managed.get(); - T* d_beta = (T*)d_beta_managed.get(); + rocsparse_int* dcsr_row_ptrA = (rocsparse_int*)dcsr_row_ptrA_managed.get(); + rocsparse_int* dcsr_col_indA = (rocsparse_int*)dcsr_col_indA_managed.get(); + T* dcsr_valA = (T*)dcsr_valA_managed.get(); + T* dB = (T*)dB_managed.get(); + T* dC_1 = (T*)dC_1_managed.get(); + T* dC_2 = (T*)dC_2_managed.get(); + T* d_alpha = (T*)d_alpha_managed.get(); + T* d_beta = (T*)d_beta_managed.get(); - if(!dval || !dptr || !dcol || !dx || !dy_1 || !dy_2 || !d_alpha || !d_beta) + if(!dcsr_valA || !dcsr_row_ptrA || !dcsr_col_indA || !dB || !dC_1 || !d_alpha || !d_beta) { verify_rocsparse_status_success(rocsparse_status_memory_error, - "!dval || !dptr || !dcol || !dx || " - "!dy_1 || !dy_2 || !d_alpha || !d_beta"); + "!dcsr_valA || !dcsr_row_ptrA || !dcsr_col_indA || !dB || " + "!dC_1 || !d_alpha || !d_beta"); return rocsparse_status_memory_error; } // copy data from CPU to device - CHECK_HIP_ERROR(hipMemcpy( - dptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); - CHECK_HIP_ERROR( - hipMemcpy(dcol, hcol_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dval, hval.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * n, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T) * m, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), 
hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcsr_row_ptrA, hcsr_row_ptrA.data(), sizeof(rocsparse_int) * (M + 1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcsr_col_indA, hcsr_col_indA.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcsr_valA, hcsr_valA.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dB, hB.data(), sizeof(T) * Bnnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dC_1, hC_1.data(), sizeof(T) * Cnnz, hipMemcpyHostToDevice)); if(argus.unit_check) { - CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * m, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dC_2, hC_2.data(), sizeof(T) * Cnnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - CHECK_ROCSPARSE_ERROR(rocsparse_csrmm( - handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrmm(handle, transA, transB, Anrow, Cncol, Ancol, nnz, &h_alpha, descr, dcsr_valA, dcsr_row_ptrA, dcsr_col_indA, dB, ldb, &h_beta, dC_1, ldc)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); - CHECK_ROCSPARSE_ERROR(rocsparse_csrmm( - handle, trans, m, n, nnz, d_alpha, descr, dval, dptr, dcol, dx, d_beta, dy_2)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrmm(handle, transA, transB, Anrow, Cncol, Ancol, nnz, d_alpha, descr, dcsr_valA, dcsr_row_ptrA, dcsr_col_indA, dB, ldb, d_beta, dC_2, ldc)); // copy output from device to CPU - CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), 
dy_2, sizeof(T) * m, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hC_1.data(), dC_1, sizeof(T) * Cnnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hC_2.data(), dC_2, sizeof(T) * Cnnz, hipMemcpyDeviceToHost)); // CPU double cpu_time_used = get_time_us(); - for(rocsparse_int i = 0; i < m; ++i) + for(rocsparse_int i = 0; i < Cnrow; ++i) { - hy_gold[i] *= h_beta; - for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; - ++j) + for(rocsparse_int j = 0; j < Cncol; ++j) { - hy_gold[i] += h_alpha * hval[j] * hx[hcol_ind[j] - idx_base]; + rocsparse_int Cidx = i + j * ldc; + T sum = hC_gold[Cidx] * h_beta; + + for(rocsparse_int k = hcsr_row_ptrA[i] - idx_base; k < hcsr_row_ptrA[i + 1] - idx_base; ++k) + { + rocsparse_int Bidx = (transB == rocsparse_operation_none) ? (hcsr_col_indA[k] - idx_base + j * ldb) : (j + (hcsr_col_indA[k] - idx_base) * ldb); + sum += h_alpha * hcsr_valA[k] * hB[Bidx]; + } + + hC_gold[Cidx] = sum; } } cpu_time_used = get_time_us() - cpu_time_used; - // enable unit check, notice unit check is not invasive, but norm check is, - // unit check and norm check can not be interchanged their order - if(argus.unit_check) - { - unit_check_general(1, m, hy_gold.data(), hy_1.data()); - unit_check_general(1, m, hy_gold.data(), hy_2.data()); - } + unit_check_near(Cnrow, Cncol, ldc, hC_gold.data(), hC_1.data()); + unit_check_near(Cnrow, Cncol, ldc, hC_gold.data(), hC_2.data()); } if(argus.timing) @@ -351,32 +395,31 @@ rocsparse_status testing_csrmm(Arguments argus) for(int iter = 0; iter < number_cold_calls; iter++) { - rocsparse_csrmm( - handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1); + rocsparse_csrmm(handle, transA, transB, Anrow, Cncol, Ancol, nnz, &h_alpha, descr, dcsr_valA, dcsr_row_ptrA, dcsr_col_indA, dB, ldb, &h_beta, dC_1, ldc); } double gpu_time_used = get_time_us(); // in microseconds for(int iter = 0; iter < number_hot_calls; iter++) { - rocsparse_csrmm( - handle, trans, 
m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1); + rocsparse_csrmm(handle, transA, transB, Anrow, Cncol, Ancol, nnz, &h_alpha, descr, dcsr_valA, dcsr_row_ptrA, dcsr_col_indA, dB, ldb, &h_beta, dC_1, ldc); } // Convert to miliseconds per call gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); - size_t flops = (h_alpha != 1.0) ? 3.0 * nnz : 2.0 * nnz; - flops = (h_beta != 0.0) ? flops + m : flops; + size_t flops = 3.0 * nnz * Bncol; + flops = (h_beta != 0.0) ? flops + Cnnz : flops; double gpu_gflops = flops / gpu_time_used / 1e6; - size_t memtrans = 2.0 * m + nnz; - memtrans = (h_beta != 0.0) ? memtrans + m : memtrans; + size_t memtrans = nnz + Cnnz + Bnnz; + memtrans = (h_beta != 0.0) ? memtrans + Cnnz : memtrans; double bandwidth = - (memtrans * sizeof(T) + (m + 1 + nnz) * sizeof(rocsparse_int)) / gpu_time_used / 1e6; + (memtrans * sizeof(T) + (M + 1 + nnz) * sizeof(rocsparse_int)) / gpu_time_used / 1e6; - printf("m\t\tn\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tmsec\n"); - printf("%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", - m, - n, + printf("m\t\tn\t\tk\t\tnnz\t\talpha\tbeta\tGFlops\tGB/s\tmsec\n"); + printf("%8d\t%8d\t%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", + M, + N, + K, nnz, h_alpha, h_beta, @@ -384,7 +427,7 @@ rocsparse_status testing_csrmm(Arguments argus) bandwidth, gpu_time_used); } -*/ + return rocsparse_status_success; } diff --git a/clients/tests/test_csrmm.cpp b/clients/tests/test_csrmm.cpp index bdb87a1d..0f14d62f 100644 --- a/clients/tests/test_csrmm.cpp +++ b/clients/tests/test_csrmm.cpp @@ -7,18 +7,38 @@ #include #include -#include +#include typedef rocsparse_index_base base; -typedef std::tuple csrmm_tuple; +typedef rocsparse_operation trans; +typedef std::tuple csrmm_tuple; +typedef std::tuple csrmm_bin_tuple; -int csrmm_M_range[] = {-1, 0, 10, 500, 7111, 10000}; -int csrmm_N_range[] = {-3, 0, 33, 842, 4441, 10000}; +int csrmm_M_range[] = {-1, 0, 42, 511, 3521}; +int csrmm_N_range[] 
= {-1, 0, 13, 33, 64, 73}; +int csrmm_K_range[] = {-1, 0, 50, 254, 1942}; -std::vector csrmm_alpha_range = {2.0, 3.0}; -std::vector csrmm_beta_range = {0.0, 1.0}; +double csrmm_alpha_range[] = {-1.0, 0.0, 3.3}; +double csrmm_beta_range[] = {-0.3, 0.0, 1.0}; base csrmm_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +trans csrmm_transA_range[] = {rocsparse_operation_none}; +trans csrmm_transB_range[] = {rocsparse_operation_none, rocsparse_operation_transpose}; + +std::string csrmm_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "ASIC_320k.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; class parameterized_csrmm : public testing::TestWithParam { @@ -29,18 +49,64 @@ class parameterized_csrmm : public testing::TestWithParam virtual void TearDown() {} }; +class parameterized_csrmm_bin : public testing::TestWithParam +{ + protected: + parameterized_csrmm_bin() {} + virtual ~parameterized_csrmm_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + Arguments setup_csrmm_arguments(csrmm_tuple tup) { Arguments arg; arg.M = std::get<0>(tup); arg.N = std::get<1>(tup); - arg.alpha = std::get<2>(tup); - arg.beta = std::get<3>(tup); - arg.idx_base = std::get<4>(tup); + arg.K = std::get<2>(tup); + arg.alpha = std::get<3>(tup); + arg.beta = std::get<4>(tup); + arg.idx_base = std::get<5>(tup); + arg.transA = std::get<6>(tup); + arg.transB = std::get<7>(tup); arg.timing = 0; return arg; } +Arguments setup_csrmm_arguments(csrmm_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = std::get<0>(tup); + arg.K = -99; + arg.alpha = std::get<1>(tup); + arg.beta = std::get<2>(tup); + arg.idx_base = std::get<3>(tup); + arg.transA = std::get<4>(tup); + arg.transB = std::get<5>(tup); + arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<6>(tup); + + // Get current executables 
absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + + return arg; +} + TEST(csrmm_bad_arg, csrmm_float) { testing_csrmm_bad_arg(); } TEST_P(parameterized_csrmm, csrmm_float) @@ -59,10 +125,39 @@ TEST_P(parameterized_csrmm, csrmm_double) EXPECT_EQ(status, rocsparse_status_success); } +TEST_P(parameterized_csrmm_bin, csrmm_bin_float) +{ + Arguments arg = setup_csrmm_arguments(GetParam()); + + rocsparse_status status = testing_csrmm(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csrmm_bin, csrmm_bin_double) +{ + Arguments arg = setup_csrmm_arguments(GetParam()); + + rocsparse_status status = testing_csrmm(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + INSTANTIATE_TEST_CASE_P(csrmm, parameterized_csrmm, testing::Combine(testing::ValuesIn(csrmm_M_range), testing::ValuesIn(csrmm_N_range), + testing::ValuesIn(csrmm_K_range), + testing::ValuesIn(csrmm_alpha_range), + testing::ValuesIn(csrmm_beta_range), + testing::ValuesIn(csrmm_idxbase_range), + testing::ValuesIn(csrmm_transA_range), + testing::ValuesIn(csrmm_transB_range))); + +INSTANTIATE_TEST_CASE_P(csrmm_bin, + parameterized_csrmm_bin, + testing::Combine(testing::ValuesIn(csrmm_N_range), testing::ValuesIn(csrmm_alpha_range), testing::ValuesIn(csrmm_beta_range), - testing::ValuesIn(csrmm_idxbase_range))); + testing::ValuesIn(csrmm_idxbase_range), + testing::ValuesIn(csrmm_transA_range), + testing::ValuesIn(csrmm_transB_range), + testing::ValuesIn(csrmm_bin))); From 130dc6a1fa0cd93d2234f69f1fef486805b5ae85 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 31 Jul 2018 16:03:58 +0200 Subject: [PATCH 201/304] clang-format --- .../rocsparse_template_specialization.cpp | 36 +- 
clients/include/testing_csrmm.hpp | 314 +++++++++++++++--- library/src/level3/csrmm_device.h | 16 +- library/src/level3/rocsparse_csrmm.cpp | 38 ++- library/src/level3/rocsparse_csrmm.hpp | 208 +++++++----- 5 files changed, 471 insertions(+), 141 deletions(-) diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 24eab0a5..b1a85cc7 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -305,7 +305,23 @@ rocsparse_status rocsparse_csrmm(rocsparse_handle handle, float* C, rocsparse_int ldc) { - return rocsparse_scsrmm(handle, trans_A, trans_B, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); + return rocsparse_scsrmm(handle, + trans_A, + trans_B, + m, + n, + k, + nnz, + alpha, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + B, + ldb, + beta, + C, + ldc); } template <> @@ -327,7 +343,23 @@ rocsparse_status rocsparse_csrmm(rocsparse_handle handle, double* C, rocsparse_int ldc) { - return rocsparse_dcsrmm(handle, trans_A, trans_B, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); + return rocsparse_dcsrmm(handle, + trans_A, + trans_B, + m, + n, + k, + nnz, + alpha, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + B, + ldb, + beta, + C, + ldc); } template <> diff --git a/clients/include/testing_csrmm.hpp b/clients/include/testing_csrmm.hpp index 6c0cf23d..67bd75a3 100644 --- a/clients/include/testing_csrmm.hpp +++ b/clients/include/testing_csrmm.hpp @@ -64,72 +64,207 @@ void testing_csrmm_bad_arg(void) { rocsparse_int* dptr_null = nullptr; - status = rocsparse_csrmm( - handle, transA, transB, M, N, K, nnz, &alpha, descr, dval, dptr_null, dcol, dB, ldb, &beta, dC, ldc); + status = rocsparse_csrmm(handle, + transA, + transB, + M, + N, + K, + nnz, + &alpha, + descr, + dval, + dptr_null, + dcol, + dB, + ldb, + &beta, + dC, + ldc); 
verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); } // testing for(nullptr == dcol) { rocsparse_int* dcol_null = nullptr; - status = rocsparse_csrmm( - handle, transA, transB, M, N, K, nnz, &alpha, descr, dval, dptr, dcol_null, dB, ldb, &beta, dC, ldc); + status = rocsparse_csrmm(handle, + transA, + transB, + M, + N, + K, + nnz, + &alpha, + descr, + dval, + dptr, + dcol_null, + dB, + ldb, + &beta, + dC, + ldc); verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); } // testing for(nullptr == dval) { T* dval_null = nullptr; - status = rocsparse_csrmm( - handle, transA, transB, M, N, K, nnz, &alpha, descr, dval_null, dptr, dcol, dB, ldb, &beta, dC, ldc); + status = rocsparse_csrmm(handle, + transA, + transB, + M, + N, + K, + nnz, + &alpha, + descr, + dval_null, + dptr, + dcol, + dB, + ldb, + &beta, + dC, + ldc); verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); } // testing for(nullptr == dB) { T* dB_null = nullptr; - status = rocsparse_csrmm( - handle, transA, transB, M, N, K, nnz, &alpha, descr, dval, dptr, dcol, dB_null, ldb, &beta, dC, ldc); + status = rocsparse_csrmm(handle, + transA, + transB, + M, + N, + K, + nnz, + &alpha, + descr, + dval, + dptr, + dcol, + dB_null, + ldb, + &beta, + dC, + ldc); verify_rocsparse_status_invalid_pointer(status, "Error: dB is nullptr"); } // testing for(nullptr == dC) { T* dC_null = nullptr; - status = rocsparse_csrmm( - handle, transA, transB, M, N, K, nnz, &alpha, descr, dval, dptr, dcol, dB, ldb, &beta, dC_null, ldc); + status = rocsparse_csrmm(handle, + transA, + transB, + M, + N, + K, + nnz, + &alpha, + descr, + dval, + dptr, + dcol, + dB, + ldb, + &beta, + dC_null, + ldc); verify_rocsparse_status_invalid_pointer(status, "Error: dC is nullptr"); } // testing for(nullptr == d_alpha) { T* d_alpha_null = nullptr; - status = rocsparse_csrmm( - handle, transA, transB, M, N, K, nnz, d_alpha_null, descr, dval, dptr, dcol, dB, ldb, &beta, dC, ldc); + status = 
rocsparse_csrmm(handle, + transA, + transB, + M, + N, + K, + nnz, + d_alpha_null, + descr, + dval, + dptr, + dcol, + dB, + ldb, + &beta, + dC, + ldc); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } // testing for(nullptr == d_beta) { T* d_beta_null = nullptr; - status = rocsparse_csrmm( - handle, transA, transB, M, N, K, nnz, &alpha, descr, dval, dptr, dcol, dB, ldb, d_beta_null, dC, ldc); + status = rocsparse_csrmm(handle, + transA, + transB, + M, + N, + K, + nnz, + &alpha, + descr, + dval, + dptr, + dcol, + dB, + ldb, + d_beta_null, + dC, + ldc); verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); } // testing for(nullptr == descr) { rocsparse_mat_descr descr_null = nullptr; - status = rocsparse_csrmm( - handle, transA, transB, M, N, K, nnz, &alpha, descr_null, dval, dptr, dcol, dB, ldb, &beta, dC, ldc); + status = rocsparse_csrmm(handle, + transA, + transB, + M, + N, + K, + nnz, + &alpha, + descr_null, + dval, + dptr, + dcol, + dB, + ldb, + &beta, + dC, + ldc); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } // testing for(nullptr == handle) { rocsparse_handle handle_null = nullptr; - status = rocsparse_csrmm( - handle_null, transA, transB, M, N, K, nnz, &alpha, descr, dval, dptr, dcol, dB, ldb, &beta, dC, ldc); + status = rocsparse_csrmm(handle_null, + transA, + transB, + M, + N, + K, + nnz, + &alpha, + descr, + dval, + dptr, + dcol, + dB, + ldb, + &beta, + dC, + ldc); verify_rocsparse_status_invalid_handle(status); } } @@ -207,11 +342,28 @@ rocsparse_status testing_csrmm(Arguments argus) } CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - status = rocsparse_csrmm(handle, transA, transB, M, N, K, nnz, &h_alpha, descr, dval, dptr, dcol, dB, ldb, &h_beta, dC, ldc); + status = rocsparse_csrmm(handle, + transA, + transB, + M, + N, + K, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol, + dB, + ldb, + &h_beta, + dC, + ldc); if(M < 0 || N < 0 
|| K < 0 || nnz < 0) { - verify_rocsparse_status_invalid_size(status, "Error: M < 0 || N < 0 || K < 0 || nnz < 0"); + verify_rocsparse_status_invalid_size(status, + "Error: M < 0 || N < 0 || K < 0 || nnz < 0"); } else { @@ -232,7 +384,8 @@ rocsparse_status testing_csrmm(Arguments argus) // Initial Data on CPU if(binfile != "") { - if(read_bin_matrix(binfile.c_str(), M, K, nnz, hcsr_row_ptrA, hcsr_col_indA, hcsr_valA, idx_base) != 0) + if(read_bin_matrix( + binfile.c_str(), M, K, nnz, hcsr_row_ptrA, hcsr_col_indA, hcsr_valA, idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); return rocsparse_status_internal_error; @@ -240,7 +393,8 @@ rocsparse_status testing_csrmm(Arguments argus) } else if(argus.laplacian) { - M = K = gen_2d_laplacian(argus.laplacian, hcsr_row_ptrA, hcsr_col_indA, hcsr_valA, idx_base); + M = K = + gen_2d_laplacian(argus.laplacian, hcsr_row_ptrA, hcsr_col_indA, hcsr_valA, idx_base); nnz = hcsr_row_ptrA[M]; } else @@ -249,7 +403,14 @@ rocsparse_status testing_csrmm(Arguments argus) if(filename != "") { - if(read_mtx_matrix(filename.c_str(), M, K, nnz, hcoo_row_indA, hcsr_col_indA, hcsr_valA, idx_base) != 0) + if(read_mtx_matrix(filename.c_str(), + M, + K, + nnz, + hcoo_row_indA, + hcsr_col_indA, + hcsr_valA, + idx_base) != 0) { fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); return rocsparse_status_internal_error; @@ -303,21 +464,22 @@ rocsparse_status testing_csrmm(Arguments argus) rocsparse_init(hB, Bnrow, Bncol); rocsparse_init(hC_1, Cnrow, Cncol); - // copy vector is easy in STL; hC_gold = hC_1: save a copy in hy_gold which will be output of CPU + // copy vector is easy in STL; hC_gold = hC_1: save a copy in hy_gold which will be output of + // CPU hC_gold = hC_1; - hC_2 = hC_1; + hC_2 = hC_1; // allocate memory on device auto dcsr_row_ptrA_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (M + 1)), device_free}; auto dcsr_col_indA_managed = 
rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; - auto dcsr_valA_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; - auto dB_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * Bnnz), device_free}; - auto dC_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * Cnnz), device_free}; - auto dC_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * Cnnz), device_free}; - auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; - auto d_beta_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + auto dcsr_valA_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dB_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * Bnnz), device_free}; + auto dC_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * Cnnz), device_free}; + auto dC_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * Cnnz), device_free}; + auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + auto d_beta_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; rocsparse_int* dcsr_row_ptrA = (rocsparse_int*)dcsr_row_ptrA_managed.get(); rocsparse_int* dcsr_col_indA = (rocsparse_int*)dcsr_col_indA_managed.get(); @@ -337,8 +499,12 @@ rocsparse_status testing_csrmm(Arguments argus) } // copy data from CPU to device - CHECK_HIP_ERROR(hipMemcpy(dcsr_row_ptrA, hcsr_row_ptrA.data(), sizeof(rocsparse_int) * (M + 1), hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dcsr_col_indA, hcsr_col_indA.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcsr_row_ptrA, + hcsr_row_ptrA.data(), + sizeof(rocsparse_int) * (M + 1), + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy( + dcsr_col_indA, hcsr_col_indA.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dcsr_valA, hcsr_valA.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); 
CHECK_HIP_ERROR(hipMemcpy(dB, hB.data(), sizeof(T) * Bnnz, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dC_1, hC_1.data(), sizeof(T) * Cnnz, hipMemcpyHostToDevice)); @@ -351,11 +517,43 @@ rocsparse_status testing_csrmm(Arguments argus) // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); - CHECK_ROCSPARSE_ERROR(rocsparse_csrmm(handle, transA, transB, Anrow, Cncol, Ancol, nnz, &h_alpha, descr, dcsr_valA, dcsr_row_ptrA, dcsr_col_indA, dB, ldb, &h_beta, dC_1, ldc)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrmm(handle, + transA, + transB, + Anrow, + Cncol, + Ancol, + nnz, + &h_alpha, + descr, + dcsr_valA, + dcsr_row_ptrA, + dcsr_col_indA, + dB, + ldb, + &h_beta, + dC_1, + ldc)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); - CHECK_ROCSPARSE_ERROR(rocsparse_csrmm(handle, transA, transB, Anrow, Cncol, Ancol, nnz, d_alpha, descr, dcsr_valA, dcsr_row_ptrA, dcsr_col_indA, dB, ldb, d_beta, dC_2, ldc)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrmm(handle, + transA, + transB, + Anrow, + Cncol, + Ancol, + nnz, + d_alpha, + descr, + dcsr_valA, + dcsr_row_ptrA, + dcsr_col_indA, + dB, + ldb, + d_beta, + dC_2, + ldc)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hC_1.data(), dC_1, sizeof(T) * Cnnz, hipMemcpyDeviceToHost)); @@ -369,11 +567,15 @@ rocsparse_status testing_csrmm(Arguments argus) for(rocsparse_int j = 0; j < Cncol; ++j) { rocsparse_int Cidx = i + j * ldc; - T sum = hC_gold[Cidx] * h_beta; + T sum = hC_gold[Cidx] * h_beta; - for(rocsparse_int k = hcsr_row_ptrA[i] - idx_base; k < hcsr_row_ptrA[i + 1] - idx_base; ++k) + for(rocsparse_int k = hcsr_row_ptrA[i] - idx_base; + k < hcsr_row_ptrA[i + 1] - idx_base; + ++k) { - rocsparse_int Bidx = (transB == rocsparse_operation_none) ? 
(hcsr_col_indA[k] - idx_base + j * ldb) : (j + (hcsr_col_indA[k] - idx_base) * ldb); + rocsparse_int Bidx = (transB == rocsparse_operation_none) + ? (hcsr_col_indA[k] - idx_base + j * ldb) + : (j + (hcsr_col_indA[k] - idx_base) * ldb); sum += h_alpha * hcsr_valA[k] * hB[Bidx]; } @@ -395,14 +597,46 @@ rocsparse_status testing_csrmm(Arguments argus) for(int iter = 0; iter < number_cold_calls; iter++) { - rocsparse_csrmm(handle, transA, transB, Anrow, Cncol, Ancol, nnz, &h_alpha, descr, dcsr_valA, dcsr_row_ptrA, dcsr_col_indA, dB, ldb, &h_beta, dC_1, ldc); + rocsparse_csrmm(handle, + transA, + transB, + Anrow, + Cncol, + Ancol, + nnz, + &h_alpha, + descr, + dcsr_valA, + dcsr_row_ptrA, + dcsr_col_indA, + dB, + ldb, + &h_beta, + dC_1, + ldc); } double gpu_time_used = get_time_us(); // in microseconds for(int iter = 0; iter < number_hot_calls; iter++) { - rocsparse_csrmm(handle, transA, transB, Anrow, Cncol, Ancol, nnz, &h_alpha, descr, dcsr_valA, dcsr_row_ptrA, dcsr_col_indA, dB, ldb, &h_beta, dC_1, ldc); + rocsparse_csrmm(handle, + transA, + transB, + Anrow, + Cncol, + Ancol, + nnz, + &h_alpha, + descr, + dcsr_valA, + dcsr_row_ptrA, + dcsr_col_indA, + dB, + ldb, + &h_beta, + dC_1, + ldc); } // Convert to miliseconds per call diff --git a/library/src/level3/csrmm_device.h b/library/src/level3/csrmm_device.h index 8e554857..f8ab2911 100644 --- a/library/src/level3/csrmm_device.h +++ b/library/src/level3/csrmm_device.h @@ -30,8 +30,8 @@ static __device__ void csrmmnn_general_device(rocsparse_int M, rocsparse_int colB = col * ldb; rocsparse_int colC = col * ldc; - __shared__ rocsparse_int shared_col[BLOCKSIZE/SUBWAVE_SIZE][SUBWAVE_SIZE]; - __shared__ T shared_val[BLOCKSIZE/SUBWAVE_SIZE][SUBWAVE_SIZE]; + __shared__ rocsparse_int shared_col[BLOCKSIZE / SUBWAVE_SIZE][SUBWAVE_SIZE]; + __shared__ T shared_val[BLOCKSIZE / SUBWAVE_SIZE][SUBWAVE_SIZE]; for(rocsparse_int row = warpid; row < M; row += nwarps) { @@ -47,7 +47,8 @@ static __device__ void 
csrmmnn_general_device(rocsparse_int M, __syncthreads(); shared_col[subid][laneid] = (k < row_end) ? __ldg(csr_col_ind + k) - idx_base : 0; - shared_val[subid][laneid] = (k < row_end) ? alpha * __ldg(csr_val + k) : static_cast(0); + shared_val[subid][laneid] = + (k < row_end) ? alpha * __ldg(csr_val + k) : static_cast(0); __syncthreads(); @@ -100,8 +101,8 @@ static __device__ void csrmmnt_general_device(rocsparse_int offset, return; } - __shared__ rocsparse_int shared_col[BLOCKSIZE/SUBWAVE_SIZE][SUBWAVE_SIZE]; - __shared__ T shared_val[BLOCKSIZE/SUBWAVE_SIZE][SUBWAVE_SIZE]; + __shared__ rocsparse_int shared_col[BLOCKSIZE / SUBWAVE_SIZE][SUBWAVE_SIZE]; + __shared__ T shared_val[BLOCKSIZE / SUBWAVE_SIZE][SUBWAVE_SIZE]; rocsparse_int row_start = __ldg(csr_row_ptr + row) - idx_base; rocsparse_int row_end = __ldg(csr_row_ptr + row + 1) - idx_base; @@ -109,7 +110,7 @@ static __device__ void csrmmnt_general_device(rocsparse_int offset, for(rocsparse_int l = offset; l < ncol; l += SUBWAVE_SIZE) { rocsparse_int col = l + laneid; - T sum = static_cast(0); + T sum = static_cast(0); for(rocsparse_int j = row_start; j < row_end; j += SUBWAVE_SIZE) { @@ -118,7 +119,8 @@ static __device__ void csrmmnt_general_device(rocsparse_int offset, __syncthreads(); shared_col[subid][laneid] = (k < row_end) ? N * (__ldg(csr_col_ind + k) - idx_base) : 0; - shared_val[subid][laneid] = (k < row_end) ? alpha * __ldg(csr_val + k) : static_cast(0); + shared_val[subid][laneid] = + (k < row_end) ? 
alpha * __ldg(csr_val + k) : static_cast(0); __syncthreads(); diff --git a/library/src/level3/rocsparse_csrmm.cpp b/library/src/level3/rocsparse_csrmm.cpp index 02d634bc..2a903983 100644 --- a/library/src/level3/rocsparse_csrmm.cpp +++ b/library/src/level3/rocsparse_csrmm.cpp @@ -29,8 +29,23 @@ extern "C" rocsparse_status rocsparse_scsrmm(rocsparse_handle handle, float* C, rocsparse_int ldc) { - return rocsparse_csrmm_template( - handle, trans_A, trans_B, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); + return rocsparse_csrmm_template(handle, + trans_A, + trans_B, + m, + n, + k, + nnz, + alpha, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + B, + ldb, + beta, + C, + ldc); } extern "C" rocsparse_status rocsparse_dcsrmm(rocsparse_handle handle, @@ -51,6 +66,21 @@ extern "C" rocsparse_status rocsparse_dcsrmm(rocsparse_handle handle, double* C, rocsparse_int ldc) { - return rocsparse_csrmm_template( - handle, trans_A, trans_B, m, n, k, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, B, ldb, beta, C, ldc); + return rocsparse_csrmm_template(handle, + trans_A, + trans_B, + m, + n, + k, + nnz, + alpha, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + B, + ldb, + beta, + C, + ldc); } diff --git a/library/src/level3/rocsparse_csrmm.hpp b/library/src/level3/rocsparse_csrmm.hpp index 51f7b7af..c37f45d3 100644 --- a/library/src/level3/rocsparse_csrmm.hpp +++ b/library/src/level3/rocsparse_csrmm.hpp @@ -14,97 +14,129 @@ #include template -__launch_bounds__(256) -__global__ void csrmmnn_kernel_host_pointer(rocsparse_int m, - rocsparse_int n, - rocsparse_int k, - rocsparse_int nnz, - T alpha, - const rocsparse_int* __restrict__ csr_row_ptr, - const rocsparse_int* __restrict__ csr_col_ind, - const T* __restrict__ csr_val, - const T* __restrict__ B, - rocsparse_int ldb, - T beta, - T* __restrict__ C, - rocsparse_int ldc, - rocsparse_index_base idx_base) +__launch_bounds__(256) __global__ + void 
csrmmnn_kernel_host_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + T alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ B, + rocsparse_int ldb, + T beta, + T* __restrict__ C, + rocsparse_int ldc, + rocsparse_index_base idx_base) { - csrmmnn_general_device(m, n, k, nnz, alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, beta, C, ldc, idx_base); + csrmmnn_general_device( + m, n, k, nnz, alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, beta, C, ldc, idx_base); } template -__launch_bounds__(256) -__global__ void csrmmnn_kernel_device_pointer(rocsparse_int m, - rocsparse_int n, - rocsparse_int k, - rocsparse_int nnz, - const T* alpha, - const rocsparse_int* __restrict__ csr_row_ptr, - const rocsparse_int* __restrict__ csr_col_ind, - const T* __restrict__ csr_val, - const T* __restrict__ B, - rocsparse_int ldb, - const T* beta, - T* __restrict__ C, - rocsparse_int ldc, - rocsparse_index_base idx_base) +__launch_bounds__(256) __global__ + void csrmmnn_kernel_device_pointer(rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const T* alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ B, + rocsparse_int ldb, + const T* beta, + T* __restrict__ C, + rocsparse_int ldc, + rocsparse_index_base idx_base) { if(*alpha == 0.0 && *beta == 1.0) { return; } - csrmmnn_general_device(m, n, k, nnz, *alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, *beta, C, ldc, idx_base); + csrmmnn_general_device( + m, n, k, nnz, *alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, *beta, C, ldc, idx_base); } template -__launch_bounds__(256) -__global__ void csrmmnt_kernel_host_pointer(rocsparse_int offset, - rocsparse_int ncol, - rocsparse_int m, - rocsparse_int n, - rocsparse_int k, - rocsparse_int nnz, - T alpha, - const 
rocsparse_int* __restrict__ csr_row_ptr, - const rocsparse_int* __restrict__ csr_col_ind, - const T* __restrict__ csr_val, - const T* __restrict__ B, - rocsparse_int ldb, - T beta, - T* __restrict__ C, - rocsparse_int ldc, - rocsparse_index_base idx_base) +__launch_bounds__(256) __global__ + void csrmmnt_kernel_host_pointer(rocsparse_int offset, + rocsparse_int ncol, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + T alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ B, + rocsparse_int ldb, + T beta, + T* __restrict__ C, + rocsparse_int ldc, + rocsparse_index_base idx_base) { - csrmmnt_general_device(offset, ncol, m, n, k, nnz, alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, beta, C, ldc, idx_base); + csrmmnt_general_device(offset, + ncol, + m, + n, + k, + nnz, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + beta, + C, + ldc, + idx_base); } template -__launch_bounds__(256) -__global__ void csrmmnt_kernel_device_pointer(rocsparse_int offset, - rocsparse_int ncol, - rocsparse_int m, - rocsparse_int n, - rocsparse_int k, - rocsparse_int nnz, - const T* alpha, - const rocsparse_int* __restrict__ csr_row_ptr, - const rocsparse_int* __restrict__ csr_col_ind, - const T* __restrict__ csr_val, - const T* __restrict__ B, - rocsparse_int ldb, - const T* beta, - T* __restrict__ C, - rocsparse_int ldc, - rocsparse_index_base idx_base) +__launch_bounds__(256) __global__ + void csrmmnt_kernel_device_pointer(rocsparse_int offset, + rocsparse_int ncol, + rocsparse_int m, + rocsparse_int n, + rocsparse_int k, + rocsparse_int nnz, + const T* alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ B, + rocsparse_int ldb, + const T* beta, + T* __restrict__ C, + rocsparse_int ldc, + rocsparse_index_base idx_base) { if(*alpha 
== 0.0 && *beta == 1.0) { return; } - csrmmnt_general_device(offset, ncol, m, n, k, nnz, *alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, *beta, C, ldc, idx_base); + csrmmnt_general_device(offset, + ncol, + m, + n, + k, + nnz, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + *beta, + C, + ldc, + idx_base); } template @@ -299,7 +331,7 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, #define SUB_WF_SIZE 8 dim3 csrmmnn_blocks((SUB_WF_SIZE * m - 1) / CSRMMNN_DIM + 1, (n - 1) / SUB_WF_SIZE + 1); dim3 csrmmnn_threads(CSRMMNN_DIM); - + if(handle->pointer_mode == rocsparse_pointer_mode_device) { hipLaunchKernelGGL((csrmmnn_kernel_device_pointer), @@ -328,7 +360,7 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, { return rocsparse_status_success; } - + hipLaunchKernelGGL((csrmmnn_kernel_host_pointer), csrmmnn_blocks, csrmmnn_threads, @@ -366,15 +398,15 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, // power of two of the average row nnz of A. // Second step: Compute remainder, which is the remaining // columns of B. 
- rocsparse_int main = 0; + rocsparse_int main = 0; rocsparse_int remainder = 0; // Launch appropriate kernel depending on row nnz of A if(avg_row_nnz < 16) { remainder = n % 8; - main = n - remainder; - + main = n - remainder; + // Launch main kernel if enough columns of B if(main > 0) { @@ -404,8 +436,8 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, else if(avg_row_nnz < 32) { remainder = n % 16; - main = n - remainder; - + main = n - remainder; + // Launch main kernel if enough columns of B if(main > 0) { @@ -435,8 +467,8 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, else if(avg_row_nnz < 64 || handle->warp_size == 32) { remainder = n % 32; - main = n - remainder; - + main = n - remainder; + // Launch main kernel if enough columns of B if(main > 0) { @@ -466,8 +498,8 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, else if(handle->warp_size == 64) { remainder = n % 64; - main = n - remainder; - + main = n - remainder; + // Launch main kernel if enough columns of B if(main > 0) { @@ -612,15 +644,15 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, return rocsparse_status_success; } - rocsparse_int main = 0; + rocsparse_int main = 0; rocsparse_int remainder = 0; // Launch appropriate kernel if(avg_row_nnz < 16) { remainder = n % 8; - main = n - remainder; - + main = n - remainder; + // Launch main kernel if enough columns of B if(main > 0) { @@ -650,8 +682,8 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, else if(avg_row_nnz < 32) { remainder = n % 16; - main = n - remainder; - + main = n - remainder; + // Launch main kernel if enough columns of B if(main > 0) { @@ -681,8 +713,8 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, else if(avg_row_nnz < 64 || handle->warp_size == 32) { remainder = n % 32; - main = n - remainder; - + main = n - remainder; + // Launch main kernel if enough columns of B if(main > 0) { @@ -712,8 +744,8 @@ 
rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, else if(handle->warp_size == 64) { remainder = n % 64; - main = n - remainder; - + main = n - remainder; + // Launch main kernel if enough columns of B if(main > 0) { From 7431751572be99a1ad3b540555c063c573f716bc Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 3 Aug 2018 10:00:29 +0200 Subject: [PATCH 202/304] increasing version number --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb84607c..9393e149 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ list(APPEND CMAKE_MODULE_PATH include(cmake/SetToolchain.cmake) # rocSPARSE project -project(rocsparse VERSION 0.1.0.0 LANGUAGES CXX) +project(rocsparse VERSION 0.1.1.0 LANGUAGES CXX) set(rocsparse_SOVERSION 0) # Set a default build type if none was specified From 36a4a4695beaf8842451fc4f317a86528ede57a4 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 3 Aug 2018 10:18:47 +0200 Subject: [PATCH 203/304] ell2csr conversion added --- library/include/rocsparse-functions.h | 131 +++++++++++ library/src/CMakeLists.txt | 1 + library/src/conversion/ell2csr_device.h | 92 ++++++++ library/src/conversion/rocsparse_ell2csr.cpp | 219 +++++++++++++++++++ library/src/conversion/rocsparse_ell2csr.hpp | 140 ++++++++++++ 5 files changed, 583 insertions(+) create mode 100644 library/src/conversion/ell2csr_device.h create mode 100644 library/src/conversion/rocsparse_ell2csr.cpp create mode 100644 library/src/conversion/rocsparse_ell2csr.hpp diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 05b114b9..a8a769db 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -1305,6 +1305,137 @@ rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, rocsparse_int* csr_row_ptr, rocsparse_index_base idx_base); +/*! 
\brief SPARSE Format Conversions API + + \details + ell2csr_nnz computes the total number of CSR non-zero elements and the CSR + pointer to the start of every row for a given ELL matrix. It is + assumed that csr_row_ptr has been allocated with size m + 1. + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + ell_descr descriptor of the ELL matrix. + @param[in] + ell_width number of non-zero elements per row in ELL storage format. + @param[in] + ell_col_ind array of m times ell_width elements containing the column indices of A. + Padded column indices should be set to -1. + @param[in] + csr_descr descriptor of the CSR matrix. + @param[out] + csr_row_ptr array of m+1 elements that point to the start of every + row of A. + @param[out] + csr_nnz pointer to the total number of non-zero elements in CSR + storage format. + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_ell2csr_nnz(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + const rocsparse_int* ell_col_ind, + const rocsparse_mat_descr csr_descr, + rocsparse_int* csr_row_ptr, + rocsparse_int* csr_nnz); + +/*! \brief SPARSE Format Conversions API + + \details + ell2csr converts an ELL matrix into a CSR matrix. It is assumed that + csr_val and csr_col_ind are allocated. Allocation size is defined by + the number of total CSR non-zero elements and can be obtained by + calling ell2csr_nnz routine. + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[in] + m number of rows of A. + @param[in] + n number of columns of A. + @param[in] + ell_descr descriptor of the ELL matrix. + @param[in] + ell_width number of non-zero elements per row in ELL storage format. + @param[in] + ell_val array of m times ell_width elements of A. 
Padded elements should be set + to 0. + @param[in] + ell_col_ind array of m times ell_width elements containing the column indices of A. + Padded column indices should be set to -1. + @param[in] + csr_descr descriptor of the CSR matrix. + @param[out] + csr_val array of nnz elements of A. + @param[in] + csr_row_ptr array of m+1 elements that point to the start + of every row of A. + @param[out] + csr_col_ind array of nnz elements containing the column indices of A. + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_sell2csr(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + const float* ell_val, + const rocsparse_int* ell_col_ind, + const rocsparse_mat_descr csr_descr, + float* csr_val, + const rocsparse_int* csr_row_ptr, + rocsparse_int* csr_col_ind); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dell2csr(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + const double* ell_val, + const rocsparse_int* ell_col_ind, + const rocsparse_mat_descr csr_descr, + double* csr_val, + const rocsparse_int* csr_row_ptr, + rocsparse_int* csr_col_ind); +/* +ROCSPARSE_EXPORT +rocsparse_status rocsparse_cell2csr(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + const rocsparse_float_complex* ell_val, + const rocsparse_int* ell_col_ind, + const rocsparse_mat_descr csr_descr, + rocsparse_float_complex* csr_val, + const rocsparse_int* csr_row_ptr, + rocsparse_int* csr_col_ind); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_zell2csr(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + const rocsparse_double_complex* ell_val, + const rocsparse_int* ell_col_ind, + const rocsparse_mat_descr csr_descr, + 
rocsparse_double_complex* csr_val, + const rocsparse_int* csr_row_ptr, + rocsparse_int* csr_col_ind); +*/ + /*! \brief SPARSE Format Conversions API \details diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 170877e4..4d235e16 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -32,6 +32,7 @@ set(rocsparse_source src/conversion/rocsparse_csr2ell.cpp src/conversion/rocsparse_csr2hyb.cpp src/conversion/rocsparse_coo2csr.cpp + src/conversion/rocsparse_ell2csr.cpp src/conversion/rocsparse_identity.cpp src/conversion/rocsparse_csrsort.cpp src/conversion/rocsparse_coosort.cpp diff --git a/library/src/conversion/ell2csr_device.h b/library/src/conversion/ell2csr_device.h new file mode 100644 index 00000000..d50efbe4 --- /dev/null +++ b/library/src/conversion/ell2csr_device.h @@ -0,0 +1,92 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef ELL2CSR_DEVICE_H +#define ELL2CSR_DEVICE_H + +#include "handle.h" + +#include + +__global__ void ell2csr_nnz_per_row(rocsparse_int m, + rocsparse_int n, + rocsparse_int ell_width, + const rocsparse_int* __restrict__ ell_col_ind, + rocsparse_index_base ell_base, + rocsparse_int* __restrict__ csr_row_ptr, + rocsparse_index_base csr_base) +{ + rocsparse_int ai = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + + if(ai >= m) + { + return; + } + + if(ai == 0) + { + csr_row_ptr[0] = csr_base; + } + + rocsparse_int nnz = 0; + + for(rocsparse_int p = 0; p < ell_width; ++p) + { + rocsparse_int idx = ELL_IND(ai, p, m, ell_width); + rocsparse_int col = ell_col_ind[idx] - ell_base; + + if(col >= 0 && col < n) + { + ++nnz; + } + else + { + break; + } + } + + csr_row_ptr[ai + 1] = nnz; +} + +template +__global__ void ell2csr_fill(rocsparse_int m, + rocsparse_int n, + rocsparse_int ell_width, + const rocsparse_int* 
__restrict__ ell_col_ind, + const T* __restrict__ ell_val, + rocsparse_index_base ell_base, + const rocsparse_int* __restrict__ csr_row_ptr, + rocsparse_int* __restrict__ csr_col_ind, + T* __restrict__ csr_val, + rocsparse_index_base csr_base) +{ + rocsparse_int ai = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + + if(ai >= m) + { + return; + } + + rocsparse_int csr_idx = csr_row_ptr[ai] - csr_base; + + for(rocsparse_int p = 0; p < ell_width; ++p) + { + rocsparse_int ell_idx = ELL_IND(ai, p, m, ell_width); + rocsparse_int ell_col = ell_col_ind[ell_idx] - ell_base; + + if(ell_col >= 0 && ell_col < n) + { + csr_col_ind[csr_idx] = ell_col + csr_base; + csr_val[csr_idx] = ell_val[ell_idx]; + ++csr_idx; + } + else + { + break; + } + } +} + +#endif // ELL2CSR_DEVICE_H diff --git a/library/src/conversion/rocsparse_ell2csr.cpp b/library/src/conversion/rocsparse_ell2csr.cpp new file mode 100644 index 00000000..b92b00c4 --- /dev/null +++ b/library/src/conversion/rocsparse_ell2csr.cpp @@ -0,0 +1,219 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */
+
+#include "definitions.h"
+#include "utility.h"
+#include "rocsparse.h"
+#include "ell2csr_device.h"
+#include "rocsparse_ell2csr.hpp"
+
+#include <hip/hip_runtime.h>
+#include <hipcub/hipcub.hpp>
+
+/*
+ * ===========================================================================
+ *    C wrapper
+ * ===========================================================================
+ */
+
+/*
+ * rocsparse_ell2csr_nnz
+ *
+ * First step of the ELL to CSR conversion: fills csr_row_ptr (m + 1 entries)
+ * and stores the total number of CSR non-zero elements in csr_nnz.
+ *
+ * A kernel counts the valid entries of every ELL row, then an inclusive
+ * prefix sum over those counts produces the row pointer array. csr_nnz is
+ * written on the host or on the device, depending on the pointer mode of
+ * the handle.
+ *
+ * Returns
+ *   rocsparse_status_invalid_handle   handle is nullptr
+ *   rocsparse_status_invalid_pointer  a descriptor or array pointer is nullptr
+ *   rocsparse_status_invalid_value    a descriptor holds an unknown index base
+ *   rocsparse_status_not_implemented  a descriptor is not of general type
+ *   rocsparse_status_invalid_size     m, n or ell_width is negative
+ *   rocsparse_status_success          otherwise (HIP failures are reported
+ *                                     through RETURN_IF_HIP_ERROR)
+ */
+extern "C" rocsparse_status rocsparse_ell2csr_nnz(rocsparse_handle handle,
+                                                  rocsparse_int m,
+                                                  rocsparse_int n,
+                                                  const rocsparse_mat_descr ell_descr,
+                                                  rocsparse_int ell_width,
+                                                  const rocsparse_int* ell_col_ind,
+                                                  const rocsparse_mat_descr csr_descr,
+                                                  rocsparse_int* csr_row_ptr,
+                                                  rocsparse_int* csr_nnz)
+{
+    // Check for valid handle and matrix descriptor
+    if(handle == nullptr)
+    {
+        return rocsparse_status_invalid_handle;
+    }
+    else if(ell_descr == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+    else if(csr_descr == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+
+    // Logging TODO bench logging
+    log_trace(handle,
+              "rocsparse_ell2csr_nnz",
+              m,
+              n,
+              (const void*&)ell_descr,
+              ell_width,
+              (const void*&)ell_col_ind,
+              (const void*&)csr_descr,
+              (const void*&)csr_row_ptr,
+              (const void*&)csr_nnz);
+
+    // Check index base
+    if(ell_descr->base != rocsparse_index_base_zero && ell_descr->base != rocsparse_index_base_one)
+    {
+        return rocsparse_status_invalid_value;
+    }
+    if(csr_descr->base != rocsparse_index_base_zero && csr_descr->base != rocsparse_index_base_one)
+    {
+        return rocsparse_status_invalid_value;
+    }
+
+    // Check matrix type
+    if(ell_descr->type != rocsparse_matrix_type_general)
+    {
+        // TODO
+        return rocsparse_status_not_implemented;
+    }
+    if(csr_descr->type != rocsparse_matrix_type_general)
+    {
+        // TODO
+        return rocsparse_status_not_implemented;
+    }
+
+    // Check sizes (ell_width is validated here as well, matching the
+    // conversion routine rocsparse_ell2csr_template)
+    if(m < 0 || n < 0 || ell_width < 0)
+    {
+        return rocsparse_status_invalid_size;
+    }
+
+    // Check pointer arguments
+    if(ell_col_ind == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+    else if(csr_row_ptr == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+    else if(csr_nnz == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+
+    // Quick return if possible. Only csr_nnz is written on this path,
+    // csr_row_ptr is left untouched.
+    if(m == 0 || n == 0)
+    {
+        if(handle->pointer_mode == rocsparse_pointer_mode_device)
+        {
+            RETURN_IF_HIP_ERROR(hipMemset(csr_nnz, 0, sizeof(rocsparse_int)));
+        }
+        else
+        {
+            *csr_nnz = 0;
+        }
+        return rocsparse_status_success;
+    }
+
+    hipStream_t stream = handle->stream;
+
+// Count nnz per row
+#define ELL2CSR_DIM 256
+    // One thread per row is sufficient: thread 0 additionally seeds
+    // csr_row_ptr[0] with the CSR index base.
+    dim3 ell2csr_blocks((m - 1) / ELL2CSR_DIM + 1);
+    dim3 ell2csr_threads(ELL2CSR_DIM);
+
+    hipLaunchKernelGGL((ell2csr_nnz_per_row),
+                       ell2csr_blocks,
+                       ell2csr_threads,
+                       0,
+                       stream,
+                       m,
+                       n,
+                       ell_width,
+                       ell_col_ind,
+                       ell_descr->base,
+                       csr_row_ptr,
+                       csr_descr->base);
+#undef ELL2CSR_DIM
+
+    // Inclusive sum over the per-row counts to obtain the csr_row_ptr array.
+    // Its last entry is the number of non-zero elements plus the CSR index base.
+    void* d_temp_storage      = nullptr;
+    size_t temp_storage_bytes = 0;
+
+    // Obtain hipcub buffer size
+    RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum(
+        d_temp_storage, temp_storage_bytes, csr_row_ptr, csr_row_ptr, m + 1, stream));
+
+    // Allocate hipcub buffer
+    RETURN_IF_HIP_ERROR(hipMalloc(&d_temp_storage, temp_storage_bytes));
+
+    // Perform actual inclusive sum on the same stream as the counting kernel,
+    // so that both are ordered even if the handle uses a non-default stream
+    hipError_t scan_status = hipcub::DeviceScan::InclusiveSum(
+        d_temp_storage, temp_storage_bytes, csr_row_ptr, csr_row_ptr, m + 1, stream);
+
+    // Wait for the scan before its buffer is released and the result is read
+    if(scan_status == hipSuccess)
+    {
+        scan_status = hipStreamSynchronize(stream);
+    }
+
+    // Free hipcub buffer on every path, so a failed scan does not leak it
+    hipError_t free_status = hipFree(d_temp_storage);
+
+    RETURN_IF_HIP_ERROR(scan_status);
+    RETURN_IF_HIP_ERROR(free_status);
+
+    // Extract and adjust nnz according to index base
+    rocsparse_int nnz;
+    RETURN_IF_HIP_ERROR(
+        hipMemcpy(&nnz, csr_row_ptr + m, sizeof(rocsparse_int), hipMemcpyDeviceToHost));
+
+    nnz -= csr_descr->base;
+
+    // Set nnz
+    if(handle->pointer_mode == rocsparse_pointer_mode_device)
+    {
+        RETURN_IF_HIP_ERROR(hipMemcpy(csr_nnz, &nnz, sizeof(rocsparse_int), hipMemcpyHostToDevice));
+    }
+    else
+    {
+        *csr_nnz = nnz;
+    }
+
+    return rocsparse_status_success;
+}
+
+extern "C" 
rocsparse_status rocsparse_sell2csr(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + const float* ell_val, + const rocsparse_int* ell_col_ind, + const rocsparse_mat_descr csr_descr, + float* csr_val, + const rocsparse_int* csr_row_ptr, + rocsparse_int* csr_col_ind) +{ + return rocsparse_ell2csr_template(handle, + m, + n, + ell_descr, + ell_width, + ell_val, + ell_col_ind, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind); +} + +extern "C" rocsparse_status rocsparse_dell2csr(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + const double* ell_val, + const rocsparse_int* ell_col_ind, + const rocsparse_mat_descr csr_descr, + double* csr_val, + const rocsparse_int* csr_row_ptr, + rocsparse_int* csr_col_ind) +{ + return rocsparse_ell2csr_template(handle, + m, + n, + ell_descr, + ell_width, + ell_val, + ell_col_ind, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind); +} diff --git a/library/src/conversion/rocsparse_ell2csr.hpp b/library/src/conversion/rocsparse_ell2csr.hpp new file mode 100644 index 00000000..61fc4e8a --- /dev/null +++ b/library/src/conversion/rocsparse_ell2csr.hpp @@ -0,0 +1,140 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */
+
+#pragma once
+#ifndef ROCSPARSE_ELL2CSR_HPP
+#define ROCSPARSE_ELL2CSR_HPP
+
+#include "rocsparse.h"
+#include "definitions.h"
+#include "handle.h"
+#include "utility.h"
+#include "ell2csr_device.h"
+
+#include <hip/hip_runtime.h>
+
+/*
+ * rocsparse_ell2csr_template
+ *
+ * Second step of the ELL to CSR conversion: copies every valid ELL entry
+ * into csr_val / csr_col_ind. csr_row_ptr is read only and is expected to
+ * hold the row pointers produced by rocsparse_ell2csr_nnz.
+ *
+ * T is the value type of ell_val and csr_val.
+ *
+ * Returns
+ *   rocsparse_status_invalid_handle   handle is nullptr
+ *   rocsparse_status_invalid_pointer  a descriptor or array pointer is nullptr
+ *   rocsparse_status_invalid_value    a descriptor holds an unknown index base
+ *   rocsparse_status_not_implemented  a descriptor is not of general type
+ *   rocsparse_status_invalid_size     m, n or ell_width is negative
+ *   rocsparse_status_success          otherwise; the kernel launch itself is
+ *                                     asynchronous and not checked here
+ */
+template <typename T>
+rocsparse_status rocsparse_ell2csr_template(rocsparse_handle handle,
+                                            rocsparse_int m,
+                                            rocsparse_int n,
+                                            const rocsparse_mat_descr ell_descr,
+                                            rocsparse_int ell_width,
+                                            const T* ell_val,
+                                            const rocsparse_int* ell_col_ind,
+                                            const rocsparse_mat_descr csr_descr,
+                                            T* csr_val,
+                                            const rocsparse_int* csr_row_ptr,
+                                            rocsparse_int* csr_col_ind)
+{
+    // Check for valid handle and matrix descriptor
+    if(handle == nullptr)
+    {
+        return rocsparse_status_invalid_handle;
+    }
+    else if(ell_descr == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+    else if(csr_descr == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+
+    // Logging TODO bench logging
+    log_trace(handle,
+              replaceX<T>("rocsparse_Xell2csr"),
+              m,
+              n,
+              (const void*&)ell_descr,
+              ell_width,
+              (const void*&)ell_val,
+              (const void*&)ell_col_ind,
+              (const void*&)csr_descr,
+              (const void*&)csr_val,
+              (const void*&)csr_row_ptr,
+              (const void*&)csr_col_ind);
+
+    // Check index base
+    if(ell_descr->base != rocsparse_index_base_zero && ell_descr->base != rocsparse_index_base_one)
+    {
+        return rocsparse_status_invalid_value;
+    }
+    if(csr_descr->base != rocsparse_index_base_zero && csr_descr->base != rocsparse_index_base_one)
+    {
+        return rocsparse_status_invalid_value;
+    }
+
+    // Check matrix type
+    if(ell_descr->type != rocsparse_matrix_type_general)
+    {
+        // TODO
+        return rocsparse_status_not_implemented;
+    }
+    if(csr_descr->type != rocsparse_matrix_type_general)
+    {
+        // TODO
+        return rocsparse_status_not_implemented;
+    }
+
+    // Check sizes
+    if(m < 0 || n < 0 || ell_width < 0)
+    {
+        return rocsparse_status_invalid_size;
+    }
+
+    // Check pointer arguments
+    if(ell_val == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+    else if(ell_col_ind == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+    else if(csr_val == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+    else if(csr_row_ptr == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+    else if(csr_col_ind == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+
+    // Quick return if possible
+    if(m == 0 || n == 0 || ell_width == 0)
+    {
+        return rocsparse_status_success;
+    }
+
+    // Stream
+    hipStream_t stream = handle->stream;
+
+#define ELL2CSR_DIM 256
+    // One thread per row of the matrix
+    dim3 ell2csr_blocks((m - 1) / ELL2CSR_DIM + 1);
+    dim3 ell2csr_threads(ELL2CSR_DIM);
+
+    hipLaunchKernelGGL((ell2csr_fill<T>),
+                       ell2csr_blocks,
+                       ell2csr_threads,
+                       0,
+                       stream,
+                       m,
+                       n,
+                       ell_width,
+                       ell_col_ind,
+                       ell_val,
+                       ell_descr->base,
+                       csr_row_ptr,
+                       csr_col_ind,
+                       csr_val,
+                       csr_descr->base);
+#undef ELL2CSR_DIM
+    return rocsparse_status_success;
+}
+
+#endif // ROCSPARSE_ELL2CSR_HPP
From c74a53c34b0cc25c1c58ccbc6517047c28a5ad88 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 3 Aug 2018 10:19:12 +0200 Subject: [PATCH 204/304] ell2csr tests and benchmarks added --- clients/benchmarks/client.cpp | 10 +- .../rocsparse_template_specialization.cpp | 52 ++ clients/include/rocsparse.hpp | 16 +- clients/include/testing_ell2csr.hpp | 712 ++++++++++++++++++ clients/tests/CMakeLists.txt | 1 + clients/tests/test_ell2csr.cpp | 142 ++++ 6 files changed, 931 insertions(+), 2 deletions(-) create mode 100644 clients/include/testing_ell2csr.hpp create mode 100644 clients/tests/test_ell2csr.cpp diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index df1319e0..56d89d0b 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -25,6 +25,7 @@ #include "testing_csr2ell.hpp" #include "testing_csr2hyb.hpp" #include "testing_coo2csr.hpp" +#include "testing_ell2csr.hpp" #include "testing_identity.hpp" #include "testing_csrsort.hpp" #include "testing_coosort.hpp" @@ 
-86,7 +87,7 @@ int main(int argc, char* argv[]) " Level1: axpyi, doti, gthr, gthrz, roti, sctr\n" " Level2: coomv, csrmv, ellmv, hybmv\n" " Conversion: csr2coo, csr2csc, csr2ell,\n" - " csr2hyb, coo2csr\n" + " csr2hyb, coo2csr, ell2csr\n" " Sorting: csrsort, coosort") ("precision,r", @@ -242,6 +243,13 @@ int main(int argc, char* argv[]) { testing_coo2csr(argus); } + else if(function == "ell2csr") + { + if(precision == 's') + testing_ell2csr(argus); + else if(precision == 'd') + testing_ell2csr(argus); + } else if(function == "csrsort") { testing_csrsort(argus); diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index b1a85cc7..d65e242c 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -518,4 +518,56 @@ rocsparse_status rocsparse_csr2hyb(rocsparse_handle handle, partition_type); } +template <> +rocsparse_status rocsparse_ell2csr(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + const float* ell_val, + const rocsparse_int* ell_col_ind, + const rocsparse_mat_descr csr_descr, + float* csr_val, + const rocsparse_int* csr_row_ptr, + rocsparse_int* csr_col_ind) +{ + return rocsparse_sell2csr(handle, + m, + n, + ell_descr, + ell_width, + ell_val, + ell_col_ind, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind); +} + +template <> +rocsparse_status rocsparse_ell2csr(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + const double* ell_val, + const rocsparse_int* ell_col_ind, + const rocsparse_mat_descr csr_descr, + double* csr_val, + const rocsparse_int* csr_row_ptr, + rocsparse_int* csr_col_ind) +{ + return rocsparse_dell2csr(handle, + m, + n, + ell_descr, + ell_width, + ell_val, + ell_col_ind, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind); +} + } // namespace 
rocsparse diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index 316d42eb..ccdb1714 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -172,6 +172,20 @@ rocsparse_status rocsparse_csr2hyb(rocsparse_handle handle, rocsparse_hyb_mat hyb, rocsparse_int user_ell_width, rocsparse_hyb_partition partition_type); -} + +template +rocsparse_status rocsparse_ell2csr(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int n, + const rocsparse_mat_descr ell_descr, + rocsparse_int ell_width, + const T* ell_val, + const rocsparse_int* ell_col_ind, + const rocsparse_mat_descr csr_descr, + T* csr_val, + const rocsparse_int* csr_row_ptr, + rocsparse_int* csr_col_ind); + +} // namespace rocsparse #endif // _ROCSPARSE_HPP_ diff --git a/clients/include/testing_ell2csr.hpp b/clients/include/testing_ell2csr.hpp new file mode 100644 index 00000000..f3e04b83 --- /dev/null +++ b/clients/include/testing_ell2csr.hpp @@ -0,0 +1,712 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */
+
+#pragma once
+#ifndef TESTING_ELL2CSR_HPP
+#define TESTING_ELL2CSR_HPP
+
+#include "rocsparse_test_unique_ptr.hpp"
+#include "rocsparse.hpp"
+#include "utility.hpp"
+#include "unit.hpp"
+
+#include <string>
+#include <algorithm>
+#include <rocsparse.h>
+
+using namespace rocsparse;
+using namespace rocsparse_test;
+
+#define ELL_IND_ROW(i, el, m, width) (el) * (m) + (i)
+#define ELL_IND_EL(i, el, m, width) (el) + (width) * (i)
+#define ELL_IND(i, el, m, width) ELL_IND_ROW(i, el, m, width)
+
+// Calls rocsparse_ell2csr_nnz and rocsparse_ell2csr with one argument at a
+// time replaced by nullptr and verifies that the matching error status
+// (invalid_pointer, or invalid_handle for the handle) is returned.
+// T is the value type of the ELL / CSR value arrays.
+template <typename T>
+void testing_ell2csr_bad_arg(void)
+{
+    rocsparse_int m         = 100;
+    rocsparse_int n         = 100;
+    rocsparse_int ell_width = 100;
+    rocsparse_int safe_size = 100;
+    rocsparse_status status;
+
+    std::unique_ptr<handle_struct> unique_ptr_handle(new handle_struct);
+    rocsparse_handle handle = unique_ptr_handle->handle;
+
+    std::unique_ptr<descr_struct> unique_ptr_csr_descr(new descr_struct);
+    rocsparse_mat_descr csr_descr = unique_ptr_csr_descr->descr;
+
+    std::unique_ptr<descr_struct> unique_ptr_ell_descr(new descr_struct);
+    rocsparse_mat_descr ell_descr = unique_ptr_ell_descr->descr;
+
+    auto ell_col_ind_managed =
+        rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free};
+    auto csr_row_ptr_managed =
+        rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free};
+
+    rocsparse_int* ell_col_ind = (rocsparse_int*)ell_col_ind_managed.get();
+    rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get();
+
+    if(!ell_col_ind || !csr_row_ptr)
+    {
+        PRINT_IF_HIP_ERROR(hipErrorOutOfMemory);
+        return;
+    }
+
+    // ELL to CSR conversion is a two step process - test both functions for bad arguments
+
+    // Step 1: Determine number of non-zero elements of CSR storage format
+    rocsparse_int csr_nnz;
+
+    // Testing for (ell_col_ind == nullptr)
+    {
+        rocsparse_int* ell_col_ind_null = nullptr;
+
+        status = rocsparse_ell2csr_nnz(
+            handle, m, n, ell_descr, ell_width, ell_col_ind_null, csr_descr, csr_row_ptr, &csr_nnz);
+        verify_rocsparse_status_invalid_pointer(status, "Error: ell_col_ind is nullptr");
+    }
+
+    // Testing for (csr_row_ptr == nullptr)
+    {
+        rocsparse_int* csr_row_ptr_null = nullptr;
+
+        status = rocsparse_ell2csr_nnz(
+            handle, m, n, ell_descr, ell_width, ell_col_ind, csr_descr, csr_row_ptr_null, &csr_nnz);
+        verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr");
+    }
+
+    // Testing for (csr_nnz == nullptr)
+    {
+        rocsparse_int* csr_nnz_null = nullptr;
+
+        status = rocsparse_ell2csr_nnz(
+            handle, m, n, ell_descr, ell_width, ell_col_ind, csr_descr, csr_row_ptr, csr_nnz_null);
+        verify_rocsparse_status_invalid_pointer(status, "Error: csr_nnz is nullptr");
+    }
+
+    // Testing for (ell_descr == nullptr)
+    {
+        rocsparse_mat_descr ell_descr_null = nullptr;
+
+        status = rocsparse_ell2csr_nnz(
+            handle, m, n, ell_descr_null, ell_width, ell_col_ind, csr_descr, csr_row_ptr, &csr_nnz);
+        verify_rocsparse_status_invalid_pointer(status, "Error: ell_descr is nullptr");
+    }
+
+    // Testing for (csr_descr == nullptr)
+    {
+        rocsparse_mat_descr csr_descr_null = nullptr;
+
+        status = rocsparse_ell2csr_nnz(
+            handle, m, n, ell_descr, ell_width, ell_col_ind, csr_descr_null, csr_row_ptr, &csr_nnz);
+        verify_rocsparse_status_invalid_pointer(status, "Error: csr_descr is nullptr");
+    }
+
+    // Testing for (handle == nullptr)
+    {
+        rocsparse_handle handle_null = nullptr;
+
+        status = rocsparse_ell2csr_nnz(
+            handle_null, m, n, ell_descr, ell_width, ell_col_ind, csr_descr, csr_row_ptr, &csr_nnz);
+        verify_rocsparse_status_invalid_handle(status);
+    }
+
+    // Allocate the ELL values and the CSR column / value arrays used by step 2
+    auto ell_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free};
+    auto csr_col_ind_managed =
+        rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free};
+    auto csr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free};
+
+    T* ell_val                 = (T*)ell_val_managed.get();
+    rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get();
+    T* csr_val                 = (T*)csr_val_managed.get();
+
+    if(!ell_val || !csr_col_ind || !csr_val)
+    {
+        PRINT_IF_HIP_ERROR(hipErrorOutOfMemory);
+        return;
+    }
+
+    // Step 2: Perform the actual conversion
+
+    // Set ell_width to some valid value, to avoid invalid_size status
+    ell_width = 10;
+
+    // Testing for (ell_col_ind == nullptr)
+    {
+        rocsparse_int* ell_col_ind_null = nullptr;
+
+        status = rocsparse_ell2csr(handle,
+                                   m,
+                                   n,
+                                   ell_descr,
+                                   ell_width,
+                                   ell_val,
+                                   ell_col_ind_null,
+                                   csr_descr,
+                                   csr_val,
+                                   csr_row_ptr,
+                                   csr_col_ind);
+        verify_rocsparse_status_invalid_pointer(status, "Error: ell_col_ind is nullptr");
+    }
+
+    // Testing for (ell_val == nullptr)
+    {
+        T* ell_val_null = nullptr;
+
+        status = rocsparse_ell2csr(handle,
+                                   m,
+                                   n,
+                                   ell_descr,
+                                   ell_width,
+                                   ell_val_null,
+                                   ell_col_ind,
+                                   csr_descr,
+                                   csr_val,
+                                   csr_row_ptr,
+                                   csr_col_ind);
+        verify_rocsparse_status_invalid_pointer(status, "Error: ell_val is nullptr");
+    }
+
+    // Testing for (csr_row_ptr == nullptr)
+    {
+        rocsparse_int* csr_row_ptr_null = nullptr;
+
+        status = rocsparse_ell2csr(handle,
+                                   m,
+                                   n,
+                                   ell_descr,
+                                   ell_width,
+                                   ell_val,
+                                   ell_col_ind,
+                                   csr_descr,
+                                   csr_val,
+                                   csr_row_ptr_null,
+                                   csr_col_ind);
+        verify_rocsparse_status_invalid_pointer(status, "Error: csr_row_ptr is nullptr");
+    }
+
+    // Testing for (csr_col_ind == nullptr)
+    {
+        rocsparse_int* csr_col_ind_null = nullptr;
+
+        status = rocsparse_ell2csr(handle,
+                                   m,
+                                   n,
+                                   ell_descr,
+                                   ell_width,
+                                   ell_val,
+                                   ell_col_ind,
+                                   csr_descr,
+                                   csr_val,
+                                   csr_row_ptr,
+                                   csr_col_ind_null);
+        verify_rocsparse_status_invalid_pointer(status, "Error: csr_col_ind is nullptr");
+    }
+
+    // Testing for (csr_val == nullptr)
+    {
+        T* csr_val_null = nullptr;
+
+        status = rocsparse_ell2csr(handle,
+                                   m,
+                                   n,
+                                   ell_descr,
+                                   ell_width,
+                                   ell_val,
+                                   ell_col_ind,
+                                   csr_descr,
+                                   csr_val_null,
+                                   csr_row_ptr,
+                                   csr_col_ind);
+        verify_rocsparse_status_invalid_pointer(status, "Error: csr_val is nullptr");
+    }
+
+    // Testing for (ell_descr == nullptr)
+    {
+        rocsparse_mat_descr ell_descr_null = nullptr;
+
+        status = rocsparse_ell2csr(handle,
+                                   m,
+                                   n,
+                                   ell_descr_null,
+                                   ell_width,
+                                   ell_val,
+                                   ell_col_ind,
+                                   csr_descr,
+                                   csr_val,
+                                   csr_row_ptr,
+                                   csr_col_ind);
+        verify_rocsparse_status_invalid_pointer(status, "Error: ell_descr is nullptr");
+    }
+
+    // Testing for (csr_descr == nullptr)
+    {
+        rocsparse_mat_descr csr_descr_null = nullptr;
+
+        status = rocsparse_ell2csr(handle,
+                                   m,
+                                   n,
+                                   ell_descr,
+                                   ell_width,
+                                   ell_val,
+                                   ell_col_ind,
+                                   csr_descr_null,
+                                   csr_val,
+                                   csr_row_ptr,
+                                   csr_col_ind);
+        verify_rocsparse_status_invalid_pointer(status, "Error: csr_descr is nullptr");
+    }
+
+    // Testing for (handle == nullptr)
+    {
+        rocsparse_handle handle_null = nullptr;
+
+        status = rocsparse_ell2csr(handle_null,
+                                   m,
+                                   n,
+                                   ell_descr,
+                                   ell_width,
+                                   ell_val,
+                                   ell_col_ind,
+                                   csr_descr,
+                                   csr_val,
+                                   csr_row_ptr,
+                                   csr_col_ind);
+        verify_rocsparse_status_invalid_handle(status);
+    }
+}
+
+template +rocsparse_status testing_ell2csr(Arguments argus) +{ + rocsparse_int m = argus.M; + rocsparse_int n = argus.N; + rocsparse_int safe_size = 100; + rocsparse_index_base ell_base = argus.idx_base; + rocsparse_index_base csr_base = argus.idx_base2; + std::string binfile = ""; + std::string filename = ""; + rocsparse_status status; + + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && n == -99 && argus.timing == 0) + { + binfile = argus.filename; + m = n = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + + double scale = 0.02; + if(m > 1000 || n > 1000) + { + scale = 2.0 / std::max(m, n); + } + rocsparse_int nnz = m * scale * n; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_ell_descr(new descr_struct); + rocsparse_mat_descr ell_descr = unique_ptr_ell_descr->descr; + + // Set ELL matrix index 
base + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(ell_descr, ell_base)); + + std::unique_ptr unique_ptr_csr_descr(new descr_struct); + rocsparse_mat_descr csr_descr = unique_ptr_csr_descr->descr; + + // Set CSR matrix index base + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(csr_descr, csr_base)); + + // Argument sanity check before allocating invalid memory + if(m <= 0 || n <= 0 || nnz <= 0) + { + auto ell_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto ell_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto csr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + + rocsparse_int* ell_col_ind = (rocsparse_int*)ell_col_ind_managed.get(); + T* ell_val = (T*)ell_val_managed.get(); + rocsparse_int* csr_row_ptr = (rocsparse_int*)csr_row_ptr_managed.get(); + + if(!ell_col_ind || !ell_val || !csr_row_ptr) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!ell_col_ind || !ell_val || !csr_row_ptr"); + return rocsparse_status_memory_error; + } + + rocsparse_int ell_width = safe_size; + + // Step 1 - obtain CSR nnz + rocsparse_int csr_nnz; + status = rocsparse_ell2csr_nnz( + handle, m, n, ell_descr, ell_width, ell_col_ind, csr_descr, csr_row_ptr, &csr_nnz); + + if(m < 0 || n < 0 || ell_width < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || ell_width < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && ell_width >= 0"); + } + + // Step 2 - perform actual conversion + auto csr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto csr_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + + rocsparse_int* csr_col_ind = (rocsparse_int*)csr_col_ind_managed.get(); + T* csr_val = (T*)csr_val_managed.get(); + + if(!csr_col_ind || 
!csr_val) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!csr_col_ind || !csr_val"); + return rocsparse_status_memory_error; + } + + status = rocsparse_ell2csr(handle, + m, + n, + ell_descr, + ell_width, + ell_val, + ell_col_ind, + csr_descr, + csr_val, + csr_row_ptr, + csr_col_ind); + + if(m < 0 || n < 0 || ell_width < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || ell_width < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && ell_width >= 0"); + } + + return rocsparse_status_success; + } + + // For testing, assemble a CSR matrix + + // Host structures + std::vector hcsr_row_ptr_gold; + std::vector hcsr_col_ind_gold; + std::vector hcsr_val_gold; + + // Sample initial CSR matrix on CPU + srand(12345ULL); + if(binfile != "") + { + if(read_bin_matrix(binfile.c_str(), + m, + n, + nnz, + hcsr_row_ptr_gold, + hcsr_col_ind_gold, + hcsr_val_gold, + csr_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + } + else if(argus.laplacian) + { + m = n = gen_2d_laplacian( + argus.laplacian, hcsr_row_ptr_gold, hcsr_col_ind_gold, hcsr_val_gold, csr_base); + nnz = hcsr_row_ptr_gold[m]; + } + else + { + std::vector hcoo_row_ind; + + if(filename != "") + { + if(read_mtx_matrix(filename.c_str(), + m, + n, + nnz, + hcoo_row_ind, + hcsr_col_ind_gold, + hcsr_val_gold, + csr_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcsr_col_ind_gold, hcsr_val_gold, csr_base); + } + + // Convert COO to CSR + hcsr_row_ptr_gold.resize(m + 1, 0); + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr_gold[hcoo_row_ind[i] + 1 - csr_base]; + } + + hcsr_row_ptr_gold[0] = csr_base; + for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr_gold[i + 1] += hcsr_row_ptr_gold[i]; + } + } + + rocsparse_int 
csr_nnz_gold = nnz; + + // Allocate memory on the device + auto dcsr_row_ptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + auto dcsr_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dcsr_val_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + + rocsparse_int* dcsr_row_ptr = (rocsparse_int*)dcsr_row_ptr_managed.get(); + rocsparse_int* dcsr_col_ind = (rocsparse_int*)dcsr_col_ind_managed.get(); + T* dcsr_val = (T*)dcsr_val_managed.get(); + + if(!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dcsr_row_ptr || !dcsr_col_ind || !dcsr_val"); + return rocsparse_status_memory_error; + } + + // Copy data from host to device + CHECK_HIP_ERROR(hipMemcpy(dcsr_row_ptr, + hcsr_row_ptr_gold.data(), + sizeof(rocsparse_int) * (m + 1), + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dcsr_col_ind, + hcsr_col_ind_gold.data(), + sizeof(rocsparse_int) * nnz, + hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dcsr_val, hcsr_val_gold.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + + // Convert CSR matrix to ELL format on GPU + rocsparse_int ell_width; + + CHECK_ROCSPARSE_ERROR( + rocsparse_csr2ell_width(handle, m, csr_descr, dcsr_row_ptr, ell_descr, &ell_width)); + + auto dell_col_ind_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (ell_width * m)), device_free}; + auto dell_val_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * (ell_width * m)), device_free}; + + rocsparse_int* dell_col_ind = (rocsparse_int*)dell_col_ind_managed.get(); + T* dell_val = (T*)dell_val_managed.get(); + + if(!dell_col_ind || !dell_val) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dell_col_ind || !dell_val"); + return rocsparse_status_memory_error; + } + + CHECK_ROCSPARSE_ERROR(rocsparse_csr2ell(handle, + m, + csr_descr, + dcsr_val, + 
dcsr_row_ptr, + dcsr_col_ind, + ell_descr, + ell_width, + dell_val, + dell_col_ind)); + + if(argus.unit_check) + { + // Determine csr non-zero entries + rocsparse_int csr_nnz; + + auto dcsr_row_ptr_conv_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + + rocsparse_int* dcsr_row_ptr_conv = (rocsparse_int*)dcsr_row_ptr_conv_managed.get(); + + if(!dcsr_row_ptr_conv) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dcsr_row_ptr_conv"); + return rocsparse_status_memory_error; + } + + CHECK_ROCSPARSE_ERROR(rocsparse_ell2csr_nnz(handle, + m, + n, + ell_descr, + ell_width, + dell_col_ind, + csr_descr, + dcsr_row_ptr_conv, + &csr_nnz)); + + // Check if CSR nnz does match + unit_check_general(1, 1, 1, &csr_nnz_gold, &csr_nnz); + + // Allocate CSR column and values arrays + auto dcsr_col_ind_conv_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * csr_nnz), device_free}; + auto dcsr_val_conv_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * csr_nnz), device_free}; + + rocsparse_int* dcsr_col_ind_conv = (rocsparse_int*)dcsr_col_ind_conv_managed.get(); + T* dcsr_val_conv = (T*)dcsr_val_conv_managed.get(); + + if(!dcsr_col_ind_conv || !dcsr_val_conv) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dcsr_col_ind_conv || !dcsr_val_conv"); + return rocsparse_status_memory_error; + } + + // Perform actual CSR conversion + CHECK_ROCSPARSE_ERROR(rocsparse_ell2csr(handle, + m, + n, + ell_descr, + ell_width, + dell_val, + dell_col_ind, + csr_descr, + dcsr_val_conv, + dcsr_row_ptr_conv, + dcsr_col_ind_conv)); + + // Verification host structures + std::vector hcsr_row_ptr(m + 1); + std::vector hcsr_col_ind(csr_nnz); + std::vector hcsr_val(csr_nnz); + + CHECK_HIP_ERROR(hipMemcpy(hcsr_row_ptr.data(), + dcsr_row_ptr_conv, + sizeof(rocsparse_int) * (m + 1), + hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hcsr_col_ind.data(), + dcsr_col_ind_conv, + 
sizeof(rocsparse_int) * csr_nnz, + hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(hcsr_val.data(), dcsr_val_conv, sizeof(T) * csr_nnz, hipMemcpyDeviceToHost)); + + // Unit check + unit_check_general(1, m + 1, 1, hcsr_row_ptr_gold.data(), hcsr_row_ptr.data()); + unit_check_general(1, csr_nnz, 1, hcsr_col_ind_gold.data(), hcsr_col_ind.data()); + unit_check_general(1, csr_nnz, 1, hcsr_val_gold.data(), hcsr_val.data()); + } + + if(argus.timing) + { + rocsparse_int number_cold_calls = 2; + rocsparse_int number_hot_calls = argus.iters; + + for(rocsparse_int iter = 0; iter < number_cold_calls; ++iter) + { + auto dcsr_row_ptr_conv_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + + rocsparse_int* dcsr_row_ptr_conv = (rocsparse_int*)dcsr_row_ptr_conv_managed.get(); + + rocsparse_int csr_nnz; + rocsparse_ell2csr_nnz(handle, + m, + n, + ell_descr, + ell_width, + dell_col_ind, + csr_descr, + dcsr_row_ptr_conv, + &csr_nnz); + + auto dcsr_col_ind_conv_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * csr_nnz), device_free}; + auto dcsr_val_conv_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * csr_nnz), device_free}; + + rocsparse_int* dcsr_col_ind_conv = (rocsparse_int*)dcsr_col_ind_conv_managed.get(); + T* dcsr_val_conv = (T*)dcsr_val_conv_managed.get(); + + rocsparse_ell2csr(handle, + m, + n, + ell_descr, + ell_width, + dell_val, + dell_col_ind, + csr_descr, + dcsr_val_conv, + dcsr_row_ptr_conv, + dcsr_col_ind_conv); + } + + double gpu_time_used = get_time_us(); + + for(rocsparse_int iter = 0; iter < number_hot_calls; ++iter) + { + auto dcsr_row_ptr_conv_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + + rocsparse_int* dcsr_row_ptr_conv = (rocsparse_int*)dcsr_row_ptr_conv_managed.get(); + + rocsparse_int csr_nnz; + rocsparse_ell2csr_nnz(handle, + m, + n, + ell_descr, + ell_width, + dell_col_ind, + csr_descr, + dcsr_row_ptr_conv, + &csr_nnz); + 
+ auto dcsr_col_ind_conv_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * csr_nnz), device_free}; + auto dcsr_val_conv_managed = + rocsparse_unique_ptr{device_malloc(sizeof(T) * csr_nnz), device_free}; + + rocsparse_int* dcsr_col_ind_conv = (rocsparse_int*)dcsr_col_ind_conv_managed.get(); + T* dcsr_val_conv = (T*)dcsr_val_conv_managed.get(); + + rocsparse_ell2csr(handle, + m, + n, + ell_descr, + ell_width, + dell_val, + dell_col_ind, + csr_descr, + dcsr_val_conv, + dcsr_row_ptr_conv, + dcsr_col_ind_conv); + } + + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + + printf("m\t\tn\t\tnnz\t\tmsec\n"); + printf("%8d\t%8d\t%9d\t%0.2lf\n", m, n, nnz, gpu_time_used); + } + + return rocsparse_status_success; +} + +#endif // TESTING_ELL2CSR_HPP diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index 6cd0dcc2..5c417026 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -62,6 +62,7 @@ set(ROCSPARSE_TEST_SOURCES test_csr2ell.cpp test_csr2hyb.cpp test_coo2csr.cpp + test_ell2csr.cpp test_identity.cpp test_csrsort.cpp test_coosort.cpp diff --git a/clients/tests/test_ell2csr.cpp b/clients/tests/test_ell2csr.cpp new file mode 100644 index 00000000..12cdadd3 --- /dev/null +++ b/clients/tests/test_ell2csr.cpp @@ -0,0 +1,142 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "testing_ell2csr.hpp" +#include "utility.hpp" + +#include +#include +#include +#include + +typedef std::tuple ell2csr_tuple; +typedef std::tuple ell2csr_bin_tuple; + +int ell2csr_M_range[] = {-1, 0, 10, 500, 872, 1000}; +int ell2csr_N_range[] = {-3, 0, 33, 242, 623, 1000}; + +rocsparse_index_base ell2csr_ell_base_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; +rocsparse_index_base ell2csr_csr_base_range[] = {rocsparse_index_base_zero, + rocsparse_index_base_one}; + +std::string ell2csr_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "bibd_22_8.bin", + "mc2depi.bin", + "scircuit.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + +class parameterized_ell2csr : public testing::TestWithParam +{ + protected: + parameterized_ell2csr() {} + virtual ~parameterized_ell2csr() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +class parameterized_ell2csr_bin : public testing::TestWithParam +{ + protected: + parameterized_ell2csr_bin() {} + virtual ~parameterized_ell2csr_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_ell2csr_arguments(ell2csr_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.N = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.idx_base2 = std::get<3>(tup); + arg.timing = 0; + return arg; +} + +Arguments setup_ell2csr_arguments(ell2csr_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.N = -99; + arg.idx_base = std::get<0>(tup); + arg.idx_base2 = std::get<1>(tup); + arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<2>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = 
'\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + + return arg; +} + +TEST(ell2csr_bad_arg, ell2csr) { testing_ell2csr_bad_arg(); } + +TEST_P(parameterized_ell2csr, ell2csr_float) +{ + Arguments arg = setup_ell2csr_arguments(GetParam()); + + rocsparse_status status = testing_ell2csr(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_ell2csr, ell2csr_double) +{ + Arguments arg = setup_ell2csr_arguments(GetParam()); + + rocsparse_status status = testing_ell2csr(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_ell2csr_bin, ell2csr_bin_float) +{ + Arguments arg = setup_ell2csr_arguments(GetParam()); + + rocsparse_status status = testing_ell2csr(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_ell2csr_bin, ell2csr_bin_double) +{ + Arguments arg = setup_ell2csr_arguments(GetParam()); + + rocsparse_status status = testing_ell2csr(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(ell2csr, + parameterized_ell2csr, + testing::Combine(testing::ValuesIn(ell2csr_M_range), + testing::ValuesIn(ell2csr_N_range), + testing::ValuesIn(ell2csr_ell_base_range), + testing::ValuesIn(ell2csr_csr_base_range))); + +INSTANTIATE_TEST_CASE_P(ell2csr_bin, + parameterized_ell2csr_bin, + testing::Combine(testing::ValuesIn(ell2csr_ell_base_range), + testing::ValuesIn(ell2csr_csr_base_range), + testing::ValuesIn(ell2csr_bin))); From 69b3964b1d4cf17df4488c4e0a348225f3d3c0ba Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 13 Aug 2018 15:02:23 +0200 Subject: [PATCH 205/304] additional check in ell2csr ; typo in API header --- library/include/rocsparse-functions.h | 2 +- library/src/conversion/rocsparse_ell2csr.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 
a8a769db..056916ef 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -1587,7 +1587,7 @@ rocsparse_status rocsparse_coosort_buffer_size(rocsparse_handle handle, n number of columns of A. @param[in] nnz number of non-zero elements of A. - @param[in] + @param[inout] coo_row_ind array of nnz elements containing the row indices of A. @param[inout] diff --git a/library/src/conversion/rocsparse_ell2csr.cpp b/library/src/conversion/rocsparse_ell2csr.cpp index b92b00c4..f45c50f9 100644 --- a/library/src/conversion/rocsparse_ell2csr.cpp +++ b/library/src/conversion/rocsparse_ell2csr.cpp @@ -76,7 +76,7 @@ extern "C" rocsparse_status rocsparse_ell2csr_nnz(rocsparse_handle handle, } // Check sizes - if(m < 0 || n < 0) + if(m < 0 || n < 0 || ell_width < 0) { return rocsparse_status_invalid_size; } @@ -96,7 +96,7 @@ extern "C" rocsparse_status rocsparse_ell2csr_nnz(rocsparse_handle handle, } // Quick return if possible - if(m == 0 || n == 0) + if(m == 0 || n == 0 || ell_width == 0) { if(handle->pointer_mode == rocsparse_pointer_mode_device) { From 70d9aeabe892f67e03725332a88f0d3dcd8aa67f Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Mon, 13 Aug 2018 16:47:12 +0200 Subject: [PATCH 206/304] comment added --- library/src/level2/rocsparse_coomv.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/library/src/level2/rocsparse_coomv.hpp b/library/src/level2/rocsparse_coomv.hpp index 980579d4..10b25396 100644 --- a/library/src/level2/rocsparse_coomv.hpp +++ b/library/src/level2/rocsparse_coomv.hpp @@ -213,6 +213,7 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, rocsparse_int* row_block_red = NULL; T* val_block_red = NULL; + // Allocating a maximum of 8 kByte RETURN_IF_HIP_ERROR(hipMalloc((void**)&row_block_red, sizeof(rocsparse_int) * nwarps)); RETURN_IF_HIP_ERROR(hipMalloc((void**)&val_block_red, sizeof(T) * nwarps)); From 89b5f7eac910fdc9f0e0f49b381b6d2aa4eca446 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: 
Mon, 13 Aug 2018 17:18:58 +0200 Subject: [PATCH 207/304] log_bench added where it made sense --- clients/benchmarks/client.cpp | 7 ++++++- library/src/conversion/rocsparse_coo2csr.cpp | 6 +++++- library/src/conversion/rocsparse_coosort.cpp | 8 ++++++-- library/src/conversion/rocsparse_csr2coo.cpp | 4 ++++ library/src/conversion/rocsparse_csr2csc.cpp | 2 +- library/src/conversion/rocsparse_csr2csc.hpp | 7 ++++++- library/src/conversion/rocsparse_csr2ell.cpp | 2 +- library/src/conversion/rocsparse_csr2ell.hpp | 7 ++++++- library/src/conversion/rocsparse_csr2hyb.hpp | 7 ++++++- library/src/conversion/rocsparse_csrsort.cpp | 10 +++++++--- library/src/conversion/rocsparse_ell2csr.cpp | 2 +- library/src/conversion/rocsparse_ell2csr.hpp | 7 ++++++- library/src/conversion/rocsparse_identity.cpp | 7 ++++++- library/src/level1/rocsparse_axpyi.hpp | 9 ++++++++- library/src/level1/rocsparse_doti.hpp | 7 ++++++- library/src/level1/rocsparse_gthr.hpp | 7 ++++++- library/src/level1/rocsparse_gthrz.hpp | 7 ++++++- library/src/level1/rocsparse_sctr.hpp | 7 ++++++- library/src/level2/rocsparse_coomv.hpp | 11 ++++++++++- library/src/level2/rocsparse_csrmv.hpp | 11 ++++++++++- library/src/level2/rocsparse_ellmv.hpp | 11 ++++++++++- library/src/level2/rocsparse_hybmv.hpp | 11 ++++++++++- 22 files changed, 133 insertions(+), 24 deletions(-) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 56d89d0b..cb06faae 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -88,7 +88,8 @@ int main(int argc, char* argv[]) " Level2: coomv, csrmv, ellmv, hybmv\n" " Conversion: csr2coo, csr2csc, csr2ell,\n" " csr2hyb, coo2csr, ell2csr\n" - " Sorting: csrsort, coosort") + " Sorting: csrsort, coosort\n" + " Misc: identity") ("precision,r", po::value(&precision)->default_value('s'), "Options: s,d") @@ -258,6 +259,10 @@ int main(int argc, char* argv[]) { testing_coosort(argus); } + else if(function == "identity") + { + testing_identity(argus); + } 
else { fprintf(stderr, "Invalid value for --function\n"); diff --git a/library/src/conversion/rocsparse_coo2csr.cpp b/library/src/conversion/rocsparse_coo2csr.cpp index 741f0f11..e934aaad 100644 --- a/library/src/conversion/rocsparse_coo2csr.cpp +++ b/library/src/conversion/rocsparse_coo2csr.cpp @@ -22,7 +22,7 @@ extern "C" rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, return rocsparse_status_invalid_handle; } - // Logging TODO bench logging + // Logging log_trace(handle, "rocsparse_coo2csr", (const void*&)coo_row_ind, @@ -31,6 +31,10 @@ extern "C" rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, (const void*&)csr_row_ptr, idx_base); + log_bench(handle, + "./rocsparse-bench -f coo2csr", + "--mtx "); + // Check sizes if(nnz < 0) { diff --git a/library/src/conversion/rocsparse_coosort.cpp b/library/src/conversion/rocsparse_coosort.cpp index 09ba1e34..b72ab1f0 100644 --- a/library/src/conversion/rocsparse_coosort.cpp +++ b/library/src/conversion/rocsparse_coosort.cpp @@ -29,7 +29,7 @@ extern "C" rocsparse_status rocsparse_coosort_buffer_size(rocsparse_handle handl return rocsparse_status_invalid_handle; } - // Logging TODO bench logging + // Logging log_trace(handle, "rocsparse_coosort_buffer_size", m, @@ -135,7 +135,7 @@ extern "C" rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, return rocsparse_status_invalid_handle; } - // Logging TODO bench logging + // Logging log_trace(handle, "rocsparse_coosort_by_row", m, @@ -146,6 +146,10 @@ extern "C" rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, (const void*&)perm, (const void*&)temp_buffer); + log_bench(handle, + "./rocsparse-bench -f coosort", + "--mtx "); + // Check sizes if(m < 0) { diff --git a/library/src/conversion/rocsparse_csr2coo.cpp b/library/src/conversion/rocsparse_csr2coo.cpp index 3cb5a901..b3332066 100644 --- a/library/src/conversion/rocsparse_csr2coo.cpp +++ b/library/src/conversion/rocsparse_csr2coo.cpp @@ -31,6 +31,10 @@ extern "C" 
rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, (const void*&)coo_row_ind, idx_base); + log_bench(handle, + "./rocsparse-bench -f csr2coo ", + "--mtx "); + // Check sizes if(nnz < 0) { diff --git a/library/src/conversion/rocsparse_csr2csc.cpp b/library/src/conversion/rocsparse_csr2csc.cpp index e72b8d02..a7b29755 100644 --- a/library/src/conversion/rocsparse_csr2csc.cpp +++ b/library/src/conversion/rocsparse_csr2csc.cpp @@ -29,7 +29,7 @@ extern "C" rocsparse_status rocsparse_csr2csc_buffer_size(rocsparse_handle handl return rocsparse_status_invalid_handle; } - // Logging TODO bench logging + // Logging log_trace(handle, "rocsparse_csr2csc_buffer_size", m, diff --git a/library/src/conversion/rocsparse_csr2csc.hpp b/library/src/conversion/rocsparse_csr2csc.hpp index 64de0a3b..3dd05d57 100644 --- a/library/src/conversion/rocsparse_csr2csc.hpp +++ b/library/src/conversion/rocsparse_csr2csc.hpp @@ -36,7 +36,7 @@ rocsparse_status rocsparse_csr2csc_template(rocsparse_handle handle, return rocsparse_status_invalid_handle; } - // Logging TODO bench logging + // Logging log_trace(handle, replaceX("rocsparse_Xcsr2csc"), m, @@ -52,6 +52,11 @@ rocsparse_status rocsparse_csr2csc_template(rocsparse_handle handle, idx_base, (const void*&)temp_buffer); + log_bench(handle, + "./rocsparse-bench -f csr2csc -r", + replaceX("X"), + "--mtx "); + // Check index base if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) { diff --git a/library/src/conversion/rocsparse_csr2ell.cpp b/library/src/conversion/rocsparse_csr2ell.cpp index 8ff18d42..1505dcc8 100644 --- a/library/src/conversion/rocsparse_csr2ell.cpp +++ b/library/src/conversion/rocsparse_csr2ell.cpp @@ -32,7 +32,7 @@ extern "C" rocsparse_status rocsparse_csr2ell_width(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } - // Logging TODO bench logging + // Logging log_trace(handle, "rocsparse_csr2ell_width", m, diff --git a/library/src/conversion/rocsparse_csr2ell.hpp 
b/library/src/conversion/rocsparse_csr2ell.hpp index 80ce56fc..aeee6696 100644 --- a/library/src/conversion/rocsparse_csr2ell.hpp +++ b/library/src/conversion/rocsparse_csr2ell.hpp @@ -40,7 +40,7 @@ rocsparse_status rocsparse_csr2ell_template(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } - // Logging TODO bench logging + // Logging log_trace(handle, replaceX("rocsparse_Xcsr2ell"), m, @@ -53,6 +53,11 @@ rocsparse_status rocsparse_csr2ell_template(rocsparse_handle handle, (const void*&)ell_val, (const void*&)ell_col_ind); + log_bench(handle, + "./rocsparse-bench -f csr2ell -r", + replaceX("X"), + "--mtx "); + // Check index base if(csr_descr->base != rocsparse_index_base_zero && csr_descr->base != rocsparse_index_base_one) { diff --git a/library/src/conversion/rocsparse_csr2hyb.hpp b/library/src/conversion/rocsparse_csr2hyb.hpp index c8a51b03..334a0b16 100644 --- a/library/src/conversion/rocsparse_csr2hyb.hpp +++ b/library/src/conversion/rocsparse_csr2hyb.hpp @@ -42,7 +42,7 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } - // Logging TODO bench logging + // Logging log_trace(handle, replaceX("rocsparse_Xcsr2hyb"), m, @@ -55,6 +55,11 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, user_ell_width, partition_type); + log_bench(handle, + "./rocsparse-bench -f csr2hyb -r", + replaceX("X"), + "--mtx "); + // Check index base if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) { diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index 3febe9cd..9a573f37 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -30,7 +30,7 @@ extern "C" rocsparse_status rocsparse_csrsort_buffer_size(rocsparse_handle handl return rocsparse_status_invalid_handle; } - // Logging TODO bench logging + // Logging log_trace(handle, 
"rocsparse_csrsort_buffer_size", m, @@ -125,9 +125,9 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } - // Logging TODO bench logging + // Logging log_trace(handle, - "rocsparse_csrsort_buffer_size", + "rocsparse_csrsort", m, n, nnz, @@ -137,6 +137,10 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, (const void*&)perm, (const void*&)temp_buffer); + log_bench(handle, + "./rocsparse-bench -f csrsort", + "--mtx "); + // Check sizes if(m < 0) { diff --git a/library/src/conversion/rocsparse_ell2csr.cpp b/library/src/conversion/rocsparse_ell2csr.cpp index f45c50f9..17bca740 100644 --- a/library/src/conversion/rocsparse_ell2csr.cpp +++ b/library/src/conversion/rocsparse_ell2csr.cpp @@ -41,7 +41,7 @@ extern "C" rocsparse_status rocsparse_ell2csr_nnz(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } - // Logging TODO bench logging + // Logging log_trace(handle, "rocsparse_ell2csr_nnz", m, diff --git a/library/src/conversion/rocsparse_ell2csr.hpp b/library/src/conversion/rocsparse_ell2csr.hpp index 61fc4e8a..ed2ba156 100644 --- a/library/src/conversion/rocsparse_ell2csr.hpp +++ b/library/src/conversion/rocsparse_ell2csr.hpp @@ -41,7 +41,7 @@ rocsparse_status rocsparse_ell2csr_template(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } - // Logging TODO bench logging + // Logging log_trace(handle, replaceX("rocsparse_Xell2csr"), m, @@ -55,6 +55,11 @@ rocsparse_status rocsparse_ell2csr_template(rocsparse_handle handle, (const void*&)csr_row_ptr, (const void*&)csr_col_ind); + log_bench(handle, + "./rocsparse-bench -f ell2csr -r", + replaceX("X"), + "--mtx "); + // Check index base if(ell_descr->base != rocsparse_index_base_zero && ell_descr->base != rocsparse_index_base_one) { diff --git a/library/src/conversion/rocsparse_identity.cpp b/library/src/conversion/rocsparse_identity.cpp index fab8bd6b..16f442cd 100644 --- 
a/library/src/conversion/rocsparse_identity.cpp +++ b/library/src/conversion/rocsparse_identity.cpp @@ -18,9 +18,14 @@ rocsparse_create_identity_permutation(rocsparse_handle handle, rocsparse_int n, return rocsparse_status_invalid_handle; } - // Logging TODO bench logging + // Logging log_trace(handle, "rocsparse_create_identity_permutation", n, (const void*&)p); + log_bench(handle, + "./rocsparse-bench -f identity", + "-n", + n); + // Check sizes if(n < 0) { diff --git a/library/src/level1/rocsparse_axpyi.hpp b/library/src/level1/rocsparse_axpyi.hpp index 30d1e254..f620fe1b 100644 --- a/library/src/level1/rocsparse_axpyi.hpp +++ b/library/src/level1/rocsparse_axpyi.hpp @@ -55,7 +55,7 @@ rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, return rocsparse_status_invalid_handle; } - // Logging // TODO bench logging + // Logging if(handle->pointer_mode == rocsparse_pointer_mode_host) { log_trace(handle, @@ -65,6 +65,13 @@ rocsparse_status rocsparse_axpyi_template(rocsparse_handle handle, (const void*&)x_val, (const void*&)x_ind, (const void*&)y); + + log_bench(handle, + "./rocsparse-bench -f axpyi -r", + replaceX("X"), + "--mtx ", + "--alpha", + *alpha); } else { diff --git a/library/src/level1/rocsparse_doti.hpp b/library/src/level1/rocsparse_doti.hpp index 955f0b8b..1c5af9c6 100644 --- a/library/src/level1/rocsparse_doti.hpp +++ b/library/src/level1/rocsparse_doti.hpp @@ -29,7 +29,7 @@ rocsparse_status rocsparse_doti_template(rocsparse_handle handle, return rocsparse_status_invalid_handle; } - // Logging // TODO bench logging + // Logging if(handle->pointer_mode == rocsparse_pointer_mode_host) { log_trace(handle, @@ -40,6 +40,11 @@ rocsparse_status rocsparse_doti_template(rocsparse_handle handle, (const void*&)y, *result, idx_base); + + log_bench(handle, + "./rocsparse-bench -f doti -r", + replaceX("X"), + "--mtx "); } else { diff --git a/library/src/level1/rocsparse_gthr.hpp b/library/src/level1/rocsparse_gthr.hpp index e57dc99e..70d5d067 100644 --- 
a/library/src/level1/rocsparse_gthr.hpp +++ b/library/src/level1/rocsparse_gthr.hpp @@ -27,7 +27,7 @@ rocsparse_status rocsparse_gthr_template(rocsparse_handle handle, return rocsparse_status_invalid_handle; } - // Logging // TODO bench logging + // Logging log_trace(handle, replaceX("rocsparse_Xgthr"), nnz, @@ -36,6 +36,11 @@ rocsparse_status rocsparse_gthr_template(rocsparse_handle handle, (const void*&)x_ind, idx_base); + log_bench(handle, + "./rocsparse-bench -f gthr -r", + replaceX("X"), + "--mtx "); + // Check index base if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) { diff --git a/library/src/level1/rocsparse_gthrz.hpp b/library/src/level1/rocsparse_gthrz.hpp index b8bb3652..be9c9fae 100644 --- a/library/src/level1/rocsparse_gthrz.hpp +++ b/library/src/level1/rocsparse_gthrz.hpp @@ -27,7 +27,7 @@ rocsparse_status rocsparse_gthrz_template(rocsparse_handle handle, return rocsparse_status_invalid_handle; } - // Logging // TODO bench logging + // Logging log_trace(handle, replaceX("rocsparse_Xgthrz"), nnz, @@ -36,6 +36,11 @@ rocsparse_status rocsparse_gthrz_template(rocsparse_handle handle, (const void*&)x_ind, idx_base); + log_bench(handle, + "./rocsparse-bench -f gthrz -r", + replaceX("X"), + "--mtx "); + // Check index base if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) { diff --git a/library/src/level1/rocsparse_sctr.hpp b/library/src/level1/rocsparse_sctr.hpp index 9ae50a5e..67716fed 100644 --- a/library/src/level1/rocsparse_sctr.hpp +++ b/library/src/level1/rocsparse_sctr.hpp @@ -27,7 +27,7 @@ rocsparse_status rocsparse_sctr_template(rocsparse_handle handle, return rocsparse_status_invalid_handle; } - // Logging // TODO bench logging + // Logging log_trace(handle, replaceX("rocsparse_Xsctr"), nnz, @@ -36,6 +36,11 @@ rocsparse_status rocsparse_sctr_template(rocsparse_handle handle, (const void*&)y, idx_base); + log_bench(handle, + "./rocsparse-bench -f sctr -r", + replaceX("X"), + "--mtx 
"); + // Check index base if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) { diff --git a/library/src/level2/rocsparse_coomv.hpp b/library/src/level2/rocsparse_coomv.hpp index 10b25396..fa2b7aaa 100644 --- a/library/src/level2/rocsparse_coomv.hpp +++ b/library/src/level2/rocsparse_coomv.hpp @@ -93,7 +93,7 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } - // Logging TODO bench logging + // Logging if(handle->pointer_mode == rocsparse_pointer_mode_host) { log_trace(handle, @@ -110,6 +110,15 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, (const void*&)x, *beta, (const void*&)y); + + log_bench(handle, + "./rocsparse-bench -f coomv -r", + replaceX("X"), + "--mtx ", + "--alpha", + *alpha, + "--beta", + *beta); } else { diff --git a/library/src/level2/rocsparse_csrmv.hpp b/library/src/level2/rocsparse_csrmv.hpp index 063aade9..a92ee0a2 100644 --- a/library/src/level2/rocsparse_csrmv.hpp +++ b/library/src/level2/rocsparse_csrmv.hpp @@ -68,7 +68,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } - // Logging TODO bench logging + // Logging if(handle->pointer_mode == rocsparse_pointer_mode_host) { log_trace(handle, @@ -85,6 +85,15 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, (const void*&)x, *beta, (const void*&)y); + + log_bench(handle, + "./rocsparse-bench -f csrmv -r", + replaceX("X"), + "--mtx " + "--alpha", + *alpha, + "--beta", + *beta); } else { diff --git a/library/src/level2/rocsparse_ellmv.hpp b/library/src/level2/rocsparse_ellmv.hpp index 246bd4b0..ae897766 100644 --- a/library/src/level2/rocsparse_ellmv.hpp +++ b/library/src/level2/rocsparse_ellmv.hpp @@ -68,7 +68,7 @@ rocsparse_status rocsparse_ellmv_template(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } - // Logging TODO bench logging + // Logging if(handle->pointer_mode == 
rocsparse_pointer_mode_host) { log_trace(handle, @@ -84,6 +84,15 @@ rocsparse_status rocsparse_ellmv_template(rocsparse_handle handle, (const void*&)x, *beta, (const void*&)y); + + log_bench(handle, + "./rocsparse-bench -f ellmv -r", + replaceX("X"), + "--mtx " + "--alpha", + *alpha, + "--beta", + *beta); } else { diff --git a/library/src/level2/rocsparse_hybmv.hpp b/library/src/level2/rocsparse_hybmv.hpp index 7b9ec378..092b0b3d 100644 --- a/library/src/level2/rocsparse_hybmv.hpp +++ b/library/src/level2/rocsparse_hybmv.hpp @@ -39,7 +39,7 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } - // Logging TODO bench logging + // Logging if(handle->pointer_mode == rocsparse_pointer_mode_host) { log_trace(handle, @@ -51,6 +51,15 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, (const void*&)x, *beta, (const void*&)y); + + log_bench(handle, + "./rocsparse-bench -f hybmv -r", + replaceX("X"), + "--mtx " + "--alpha", + *alpha, + "--beta", + *beta); } else { From c9f40639028494e131dfa0e12abfc33023928f06 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Tue, 14 Aug 2018 08:11:04 +0200 Subject: [PATCH 208/304] 3hr timeout for CI --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6009f66a..a5dbe10c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -4,7 +4,7 @@ // Mostly generated from snippet generator 'properties; set job properties' // Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM properties([ - pipelineTriggers([cron('0 22 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]), + pipelineTriggers([cron('0 23 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]), buildDiscarder(logRotator( artifactDaysToKeepStr: '', artifactNumToKeepStr: '', @@ -200,7 +200,7 @@ def docker_build_inside_image( def build_image, compiler_data compiler_args, doc stage( "Test ${compiler_args.compiler_name} 
${compiler_args.build_config}" ) { // Cap the maximum amount of testing to be a few hours; assume failure if the time limit is hit - timeout(time: 2, unit: 'HOURS') + timeout(time: 3, unit: 'HOURS') { if(isJobStartedByTimer()) { From 49a6837c61869d985ed98c91aa43119902ce7da7 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 08:07:46 +0200 Subject: [PATCH 209/304] added index base to csrmv kernel --- library/src/level2/csrmv_device.h | 36 +++++++++++++++++++------------ 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index 4b27bfaa..85085b47 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -187,7 +187,15 @@ static __device__ void csrmvn_general_device(rocsparse_int m, __device__ static __inline__ void atomic_add(float *address, float val) { - atomicAdd(address, val); + unsigned int newVal; + unsigned int prevVal; + + do + { + prevVal = __float_as_uint(*address); + newVal = __float_as_uint(val + *address); + } + while(atomicCAS((unsigned int*)address, prevVal, newVal) != prevVal); } __device__ static __inline__ void atomic_add(double *address, double val) @@ -268,10 +276,10 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. // If there are more items in this row, we assign more workgroups. - unsigned int vecStart = hc::__mad24(wg, (unsigned int)(BLOCK_MULTIPLIER * BLOCKSIZE), (unsigned int)csr_row_ptr[row]); - unsigned int vecEnd = (csr_row_ptr[row + 1] > vecStart + BLOCK_MULTIPLIER * BLOCKSIZE) + unsigned int vecStart = hc::__mad24(wg, (unsigned int)(BLOCK_MULTIPLIER * BLOCKSIZE), (unsigned int)(csr_row_ptr[row] - idx_base)); + unsigned int vecEnd = ((csr_row_ptr[row + 1] - idx_base) > vecStart + BLOCK_MULTIPLIER * BLOCKSIZE) ? 
vecStart + BLOCK_MULTIPLIER * BLOCKSIZE - : csr_row_ptr[row + 1]; + : (csr_row_ptr[row + 1] - idx_base); T temp_sum = 0.; @@ -309,11 +317,11 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // Stream all of this row block's matrix values into local memory. // Perform the matvec in parallel with this work. - unsigned int col = csr_row_ptr[row] + lid; + unsigned int col = csr_row_ptr[row] + lid - idx_base; if(gid != (gridDim.x - 1)) { for(int i = 0; i < BLOCKSIZE; i += WG_SIZE) - partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i]]; + partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i] - idx_base]; } else { @@ -324,8 +332,8 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // However, this may change in the future (e.g. with shared virtual memory.) // This causes a minor performance loss because this is the last workgroup // to be launched, and this loop can't be unrolled. - for(int i = 0; col + i < csr_row_ptr[stop_row]; i += WG_SIZE) - partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i]]; + for(int i = 0; col + i < csr_row_ptr[stop_row] - idx_base; i += WG_SIZE) + partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i] - idx_base]; } __syncthreads(); @@ -427,8 +435,8 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // Any workgroup only calculates, at most, BLOCKSIZE items in this row. // If there are more items in this row, we use CSR-LongRows. temp_sum = 0.; - vecStart = csr_row_ptr[row]; - vecEnd = csr_row_ptr[row + 1]; + vecStart = csr_row_ptr[row] - idx_base; + vecEnd = csr_row_ptr[row + 1] - idx_base; // Load in a bunch of partial results into your register space, rather than LDS (no // contention) @@ -437,7 +445,7 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // things. 
for(unsigned long long j = vecStart + lid; j < vecEnd; j += WG_SIZE) { - unsigned int col = csr_col_ind[(unsigned int)j]; + unsigned int col = csr_col_ind[(unsigned int)j] - idx_base; temp_sum += alpha * csr_val[(unsigned int)j] * x[col]; } @@ -513,18 +521,18 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // That increases register pressure and reduces occupancy. for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) { - temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j]]; + temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j] - idx_base]; #if 2 * WG_SIZE <= BLOCK_MULTIPLIER * BLOCKSIZE // If you can, unroll this loop once. It somewhat helps performance. j += WG_SIZE; - temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j]]; + temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j] - idx_base]; #endif } } else { for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) - temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j]]; + temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j] - idx_base]; } partialSums[lid] = temp_sum; From d8ce82a35cd267e267fec15dc0dcec28e2784bfb Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 08:09:21 +0200 Subject: [PATCH 210/304] changing single csrmv info struct to a unique globally available struct that holds all sub-info structs --- library/include/rocsparse-auxiliary.h | 27 +++--- library/include/rocsparse-functions.h | 10 +- library/include/rocsparse-types.h | 2 +- library/src/handle.cpp | 55 +++++++++++ library/src/include/handle.h | 64 ++++++++++--- library/src/level2/rocsparse_csrmv.cpp | 126 ++++++++++++++++++++++--- library/src/level2/rocsparse_csrmv.hpp | 46 ++++++++- library/src/rocsparse_auxiliary.cpp | 24 ++--- 8 files changed, 296 insertions(+), 58 deletions(-) diff --git a/library/include/rocsparse-auxiliary.h b/library/include/rocsparse-auxiliary.h index d5d12743..f2ad3e98 100644 --- a/library/include/rocsparse-auxiliary.h +++ 
b/library/include/rocsparse-auxiliary.h @@ -72,7 +72,7 @@ ROCSPARSE_EXPORT rocsparse_status rocsparse_get_version(rocsparse_handle handle, rocsparse_int* version); /******************************************************************************** - * \brief rocsparse_create_mat_descr_t is a structure holding the rocsparse matrix + * \brief rocsparse_mat_descr is a structure holding the rocsparse matrix * descriptor. It must be initialized using rocsparse_create_mat_descr() * and the retured handle must be passed to all subsequent library function * calls that involve the matrix. @@ -112,11 +112,10 @@ ROCSPARSE_EXPORT rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr); /******************************************************************************** - * \brief rocsparse_create_hyb_mat is a structure holding the rocsparse HYB - * matrix. It must be initialized using rocsparse_create_hyb_mat() - * and the retured handle must be passed to all subsequent library function - * calls that involve the HYB matrix. - * It should be destroyed at the end using rocsparse_destroy_hyb_mat(). + * \brief rocsparse_hyb_mat is a structure holding the rocsparse HYB matrix. It + * must be initialized using rocsparse_create_hyb_mat() and the returned handle + * must be passed to all subsequent library function calls that involve the HYB + * matrix. It should be destroyed at the end using rocsparse_destroy_hyb_mat(). *******************************************************************************/ ROCSPARSE_EXPORT rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat* hyb); @@ -128,20 +127,20 @@ ROCSPARSE_EXPORT rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb); /******************************************************************************** - * \brief rocsparse_create_csrmv_info is a structure holding the rocsparse - * csrmv info data gathered during csrmv_analysis. 
It must be initialized using - * rocsparse_create_csrmv_info() and the retured info structure must be passed - * to all subsequent csrmv adaptive function calls. It should be destroyed at - * the end using rocsparse_destroy_csrmv_info(). + * \brief rocsparse_mat_info is a structure holding the matrix info data that is + * gathered during the analysis routines. It must be initialized by calling + * rocsparse_create_mat_info() and the returned info structure must be passed + * to all subsequent function calls that require additional information. It + * should be destroyed at the end using rocsparse_destroy_mat_info(). *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_create_csrmv_info(rocsparse_csrmv_info* info); +rocsparse_status rocsparse_create_mat_info(rocsparse_mat_info* info); /******************************************************************************** - * \brief Destroy csrmv info. + * \brief Destroy mat info. *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_destroy_csrmv_info(rocsparse_csrmv_info info); +rocsparse_status rocsparse_destroy_mat_info(rocsparse_mat_info info); #ifdef __cplusplus } diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index f2e4e95f..19df93f4 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -567,7 +567,7 @@ rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, const rocsparse_mat_descr descr, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, - rocsparse_csrmv_info info); + rocsparse_mat_info info); /*! 
\brief SPARSE Level 2 API @@ -632,7 +632,7 @@ rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, const float* x, const float* beta, float* y, - const rocsparse_csrmv_info info); + const rocsparse_mat_info info); ROCSPARSE_EXPORT rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, @@ -648,7 +648,7 @@ rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, const double* x, const double* beta, double* y, - const rocsparse_csrmv_info info); + const rocsparse_mat_info info); /* ROCSPARSE_EXPORT rocsparse_status rocsparse_ccsrmv(rocsparse_handle handle, @@ -664,7 +664,7 @@ rocsparse_status rocsparse_ccsrmv(rocsparse_handle handle, const rocsparse_float_complex* x, const rocsparse_float_complex* beta, rocsparse_float_complex* y, - const rocsparse_csrmv_info info); + const rocsparse_mat_info info); ROCSPARSE_EXPORT rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, @@ -680,7 +680,7 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, const rocsparse_double_complex* x, const rocsparse_double_complex* beta, rocsparse_double_complex* y, - const rocsparse_csrmv_info info); + const rocsparse_mat_info info); */ /*! 
\brief SPARSE Level 2 API diff --git a/library/include/rocsparse-types.h b/library/include/rocsparse-types.h index 420dc0f3..b3e272cb 100644 --- a/library/include/rocsparse-types.h +++ b/library/include/rocsparse-types.h @@ -23,7 +23,7 @@ typedef int32_t rocsparse_int; typedef struct _rocsparse_handle* rocsparse_handle; typedef struct _rocsparse_mat_descr* rocsparse_mat_descr; typedef struct _rocsparse_hyb_mat* rocsparse_hyb_mat; -typedef struct _rocsparse_csrmv_info* rocsparse_csrmv_info; +typedef struct _rocsparse_mat_info* rocsparse_mat_info; #ifdef __cplusplus extern "C" { diff --git a/library/src/handle.cpp b/library/src/handle.cpp index d873570d..a4f3a78a 100644 --- a/library/src/handle.cpp +++ b/library/src/handle.cpp @@ -84,3 +84,58 @@ rocsparse_status _rocsparse_handle::get_stream(hipStream_t* user_stream) const *user_stream = stream; return rocsparse_status_success; } + +/******************************************************************************** + * \brief rocsparse_csrmv_info is a structure holding the rocsparse csrmv info + * data gathered during csrmv_analysis. It must be initialized using the + * rocsparse_create_csrmv_info() routine. It should be destroyed at the end + * rocsparse_destroy_csrmv_info(). + *******************************************************************************/ +rocsparse_status rocsparse_create_csrmv_info(rocsparse_csrmv_info* info) +{ + if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else + { + // Allocate + try + { + *info = new _rocsparse_csrmv_info; + } + catch(const rocsparse_status& status) + { + return status; + } + return rocsparse_status_success; + } +} + +/******************************************************************************** + * \brief Destroy csrmv info. 
+ *******************************************************************************/ +rocsparse_status rocsparse_destroy_csrmv_info(rocsparse_csrmv_info info) +{ + if(info == nullptr) + { + return rocsparse_status_success; + } + + // Destruct + try + { + // Clean up row blocks + if(info->row_blocks != nullptr) + { + hipFree(info->row_blocks); + } + + delete info; + } + catch(const rocsparse_status& status) + { + return status; + } + return rocsparse_status_success; +} diff --git a/library/src/include/handle.h b/library/src/include/handle.h index 6f28fc77..8588d25e 100644 --- a/library/src/include/handle.h +++ b/library/src/include/handle.h @@ -12,6 +12,9 @@ #include #include +/*! \brief typedefs to opaque info structs */ +typedef struct _rocsparse_csrmv_info* rocsparse_csrmv_info; + /******************************************************************************** * \brief rocsparse_handle is a structure holding the rocsparse library context. * It must be initialized using rocsparse_create_handle() @@ -101,19 +104,56 @@ struct _rocsparse_hyb_mat }; /******************************************************************************** - * \brief rocsparse_create_csrmv_info is a structure holding the rocsparse - * csrmv info data gathered during csrmv_analysis. It must be initialized using - * rocsparse_create_csrmv_info() and the retured info structure must be passed - * to all subsequent csrmv adaptive function calls. It should be destroyed at - * the end using rocsparse_destroy_csrmv_info(). + * \brief rocsparse_mat_info is a structure holding the matrix info data that is + * gathered during the analysis routines. It must be initialized by calling + * rocsparse_create_mat_info() and the returned info structure must be passed + * to all subsequent function calls that require additional information. It + * should be destroyed at the end using rocsparse_destroy_mat_info(). 
+ *******************************************************************************/ +struct _rocsparse_mat_info +{ + rocsparse_csrmv_info csrmv_info = nullptr; +}; + + + +/******************************************************************************** + * \brief rocsparse_csrmv_info is a structure holding the rocsparse csrmv info + * data gathered during csrmv_analysis. It must be initialized using the + * rocsparse_create_csrmv_info() routine. It should be destroyed at the end + * rocsparse_destroy_csrmv_info(). + *******************************************************************************/ +struct _rocsparse_csrmv_info +{ + // built flag + bool built = false; + // num row blocks + size_t size = 0; + // row blocks + unsigned long long* row_blocks = nullptr; + + // some data to verify correct execution + rocsparse_operation trans; + rocsparse_int m; + rocsparse_int n; + rocsparse_int nnz; + const _rocsparse_mat_descr* descr; + const rocsparse_int* csr_row_ptr; + const rocsparse_int* csr_col_ind; +}; + +/******************************************************************************** + * \brief rocsparse_csrmv_info is a structure holding the rocsparse csrmv info + * data gathered during csrmv_analysis. It must be initialized using the + * rocsparse_create_csrmv_info() routine. It should be destroyed at the end + * rocsparse_destroy_csrmv_info(). + *******************************************************************************/ +rocsparse_status rocsparse_create_csrmv_info(rocsparse_csrmv_info* info); + +/******************************************************************************** + * \brief Destroy csrmv info. 
*******************************************************************************/ - struct _rocsparse_csrmv_info - { - // num row blocks - size_t size = 0; - // row blocks - unsigned long long* row_blocks = nullptr; - }; +rocsparse_status rocsparse_destroy_csrmv_info(rocsparse_csrmv_info info); /******************************************************************************** * \brief ELL format indexing diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index 0fdeec22..01dabcdb 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -234,38 +234,142 @@ extern "C" rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, const rocsparse_mat_descr descr, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, - rocsparse_csrmv_info info) + rocsparse_mat_info info) { + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + log_trace(handle, + "rocsparse_csrmv_analysis", + trans, + m, + n, + nnz, + (const void*&)descr, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)info); + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(n < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == 
nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || n == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Clear csrmv info + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrmv_info(info->csrmv_info)); + + // Create csrmv info + RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_csrmv_info(&info->csrmv_info)); + // row blocks size - info->size = 0; + info->csrmv_info->size = 0; // Temporary arrays to hold device data std::vector hptr(m + 1); RETURN_IF_HIP_ERROR(hipMemcpy(hptr.data(), csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToHost)); // Determine row blocks array size - ComputeRowBlocks((unsigned long long*)NULL, info->size, hptr.data(), m, false); + ComputeRowBlocks((unsigned long long*)NULL, info->csrmv_info->size, hptr.data(), m, false); // Create row blocks structure - std::vector row_blocks(info->size, 0); + std::vector row_blocks(info->csrmv_info->size, 0); ComputeRowBlocks(row_blocks.data(), - info->size, + info->csrmv_info->size, hptr.data(), m, true); -printf("Required buffer size: %lu kByte\n", info->size * sizeof(unsigned long long) >> 10); - // Allocate memory on device to hold csrmv info - RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->row_blocks, sizeof(unsigned long long) * info->size)); + RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->csrmv_info->row_blocks, sizeof(unsigned long long) * info->csrmv_info->size)); // Copy row blocks information to device - RETURN_IF_HIP_ERROR(hipMemcpy(info->row_blocks, row_blocks.data(), sizeof(unsigned long long) * info->size, hipMemcpyHostToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpy(info->csrmv_info->row_blocks, row_blocks.data(), sizeof(unsigned long long) * info->csrmv_info->size, hipMemcpyHostToDevice)); + + // Store some pointers to verify correct execution + info->csrmv_info->trans = trans; + info->csrmv_info->m = m; + info->csrmv_info->n = n; + info->csrmv_info->nnz = nnz; + info->csrmv_info->descr = descr; + 
info->csrmv_info->csr_row_ptr = csr_row_ptr; + info->csrmv_info->csr_col_ind = csr_col_ind; + + // Set built flag + info->csrmv_info->built = true; return rocsparse_status_success; } +extern "C" rocsparse_status rocsparse_csrmv_analysis_clear(rocsparse_handle handle, + rocsparse_mat_info info) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging TODO bench logging + log_trace(handle, + "rocsparse_csrmv_analysis_clear", + (const void*&)info); + + return rocsparse_destroy_csrmv_info(info->csrmv_info); +} + extern "C" rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, rocsparse_operation trans, rocsparse_int m, @@ -279,7 +383,7 @@ extern "C" rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, const float* x, const float* beta, float* y, - const rocsparse_csrmv_info info) + const rocsparse_mat_info info) { return rocsparse_csrmv_template( handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); @@ -298,7 +402,7 @@ extern "C" rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, const double* x, const double* beta, double* y, - const rocsparse_csrmv_info info) + const rocsparse_mat_info info) { return rocsparse_csrmv_template( handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); diff --git a/library/src/level2/rocsparse_csrmv.hpp b/library/src/level2/rocsparse_csrmv.hpp index 70a86e72..e9c9c284 100644 --- a/library/src/level2/rocsparse_csrmv.hpp +++ b/library/src/level2/rocsparse_csrmv.hpp @@ -96,7 +96,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, const T* x, const T* beta, T* y, - const rocsparse_csrmv_info info) + const rocsparse_mat_info info) { // Check for valid handle and matrix descriptor if(handle == nullptr) @@ -124,7 +124,8 @@ rocsparse_status 
rocsparse_csrmv_template(rocsparse_handle handle, (const void*&)csr_col_ind, (const void*&)x, *beta, - (const void*&)y); + (const void*&)y, + (const void*&)info); } else { @@ -210,10 +211,15 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, // If csrmv info is not available, call csrmv general return rocsparse_csrmv_general_template(handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } + else if(info->csrmv_info == nullptr) + { + // If csrmv info is not available, call csrmv general + return rocsparse_csrmv_general_template(handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); + } else { // If csrmv info is available, call csrmv adaptive - return rocsparse_csrmv_adaptive_template(handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); + return rocsparse_csrmv_adaptive_template(handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info->csrmv_info); } } @@ -675,6 +681,40 @@ rocsparse_status rocsparse_csrmv_adaptive_template(rocsparse_handle handle, T* y, const rocsparse_csrmv_info info) { + // Check if info matches current matrix and options + if(info->built == false) + { + return rocsparse_status_invalid_value; + } + else if(info->trans != trans) + { + return rocsparse_status_invalid_value; + } + else if(info->m != m) + { + return rocsparse_status_invalid_size; + } + else if(info->n != n) + { + return rocsparse_status_invalid_size; + } + else if(info->nnz != nnz) + { + return rocsparse_status_invalid_size; + } + else if(info->descr != descr) + { + return rocsparse_status_invalid_value; + } + else if(info->csr_row_ptr != csr_row_ptr) + { + return rocsparse_status_invalid_pointer; + } + else if(info->csr_col_ind != csr_col_ind) + { + return rocsparse_status_invalid_pointer; + } + // Stream hipStream_t stream = handle->stream; diff --git a/library/src/rocsparse_auxiliary.cpp 
b/library/src/rocsparse_auxiliary.cpp index 9c6afe37..3ea01c06 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -3,6 +3,7 @@ * ************************************************************************ */ #include "handle.h" +#include "definitions.h" #include "rocsparse.h" #include "utility.h" @@ -317,13 +318,13 @@ rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb) } /******************************************************************************** - * \brief rocsparse_create_csrmv_info is a structure holding the rocsparse - * csrmv info data gathered during csrmv_analysis. It must be initialized using - * rocsparse_create_csrmv_info() and the retured info structure must be passed - * to all subsequent csrmv adaptive function calls. It should be destroyed at - * the end using rocsparse_destroy_csrmv_info(). + * \brief rocsparse_mat_info is a structure holding the matrix info data that is + * gathered during the analysis routines. It must be initialized by calling + * rocsparse_create_mat_info() and the returned info structure must be passed + * to all subsequent function calls that require additional information. It + * should be destroyed at the end using rocsparse_destroy_mat_info(). *******************************************************************************/ -rocsparse_status rocsparse_create_csrmv_info(rocsparse_csrmv_info* info) +rocsparse_status rocsparse_create_mat_info(rocsparse_mat_info* info) { if(info == nullptr) { @@ -334,7 +335,7 @@ rocsparse_status rocsparse_create_csrmv_info(rocsparse_csrmv_info* info) // Allocate try { - *info = new _rocsparse_csrmv_info; + *info = new _rocsparse_mat_info; } catch(const rocsparse_status& status) { @@ -345,17 +346,16 @@ rocsparse_status rocsparse_create_csrmv_info(rocsparse_csrmv_info* info) } /******************************************************************************** - * \brief Destroy csrmv info. + * \brief Destroy mat info. 
*******************************************************************************/ -rocsparse_status rocsparse_destroy_csrmv_info(rocsparse_csrmv_info info) +rocsparse_status rocsparse_destroy_mat_info(rocsparse_mat_info info) { // Destruct try { - // Clean up row blocks - if(info->row_blocks != nullptr) + if(info->csrmv_info != nullptr) { - hipFree(info->row_blocks); + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrmv_info(info->csrmv_info)); } delete info; From 9be4e52857c33509aa901ff6247d0a4449214e62 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 08:10:01 +0200 Subject: [PATCH 211/304] test header adjustments for changed info structs --- clients/common/rocsparse_template_specialization.cpp | 10 ++++++---- clients/include/rocsparse.hpp | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 40034d52..3cf51a44 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -199,10 +199,11 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, const rocsparse_int* csr_col_ind, const float* x, const float* beta, - float* y) + float* y, + const rocsparse_csrmv_info info) { return rocsparse_scsrmv( - handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); + handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); } template <> @@ -218,10 +219,11 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, const rocsparse_int* csr_col_ind, const double* x, const double* beta, - double* y) + double* y, + const rocsparse_csrmv_info info) { return rocsparse_dcsrmv( - handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); + handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); } template <> diff --git 
a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index a74400b5..9351b686 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -89,7 +89,8 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, const rocsparse_int* csr_col_ind, const T* x, const T* beta, - T* y); + T* y, + const rocsparse_csrmv_info info); template rocsparse_status rocsparse_ellmv(rocsparse_handle handle, From f5300aaa0c3b644187cd9ac346c3135ba032aeae Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 08:11:58 +0200 Subject: [PATCH 212/304] csrmv test update for adaptive case --- clients/include/rocsparse_test_unique_ptr.hpp | 16 ++ clients/include/testing_csrmv.hpp | 265 +++++++++++++----- clients/tests/test_csrmv.cpp | 8 +- 3 files changed, 212 insertions(+), 77 deletions(-) diff --git a/clients/include/rocsparse_test_unique_ptr.hpp b/clients/include/rocsparse_test_unique_ptr.hpp index 4c0fe481..444d94cc 100644 --- a/clients/include/rocsparse_test_unique_ptr.hpp +++ b/clients/include/rocsparse_test_unique_ptr.hpp @@ -87,6 +87,22 @@ struct hyb_struct } }; +struct csrmv_info_struct +{ + rocsparse_csrmv_info info; + csrmv_info_struct() + { + rocsparse_status status = rocsparse_create_csrmv_info(&info); + verify_rocsparse_status_success(status, "ERROR: csrmv_info_struct constructor"); + } + + ~csrmv_info_struct() + { + rocsparse_status status = rocsparse_destroy_csrmv_info(info); + verify_rocsparse_status_success(status, "ERROR: csrmv_info_struct destructor"); + } +}; + } // namespace rocsparse_test using rocsparse_unique_ptr = std::unique_ptr; diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index d125de25..4230295c 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -36,6 +36,9 @@ void testing_csrmv_bad_arg(void) std::unique_ptr unique_ptr_descr(new descr_struct); rocsparse_mat_descr descr = unique_ptr_descr->descr; + std::unique_ptr unique_ptr_csrmv_info(new 
csrmv_info_struct); + rocsparse_csrmv_info info = unique_ptr_csrmv_info->info; + auto dptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; auto dcol_managed = @@ -56,12 +59,57 @@ void testing_csrmv_bad_arg(void) return; } + // testing rocsparse_csrmv_analysis + + // testing for(nullptr == dptr) + { + rocsparse_int* dptr_null = nullptr; + + status = rocsparse_csrmv_analysis( + handle, trans, m, n, nnz, descr, dptr_null, dcol, info); + verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); + } + // testing for(nullptr == dcol) + { + rocsparse_int* dcol_null = nullptr; + + status = rocsparse_csrmv_analysis( + handle, trans, m, n, nnz, descr, dptr, dcol_null, info); + verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); + } + // testing for(nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = rocsparse_csrmv_analysis( + handle, trans, m, n, nnz, descr_null, dptr, dcol, info); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == info) + { + rocsparse_csrmv_info info_null = nullptr; + + status = rocsparse_csrmv_analysis( + handle, trans, m, n, nnz, descr, dptr, dcol, info_null); + verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrmv_analysis( + handle_null, trans, m, n, nnz, descr, dptr, dcol, info); + verify_rocsparse_status_invalid_handle(status); + } + + // testing rocsparse_csrmv + // testing for(nullptr == dptr) { rocsparse_int* dptr_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr, dval, dptr_null, dcol, dx, &beta, dy); + handle, trans, m, n, nnz, &alpha, descr, dval, dptr_null, dcol, dx, &beta, dy, nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); } // testing for(nullptr == dcol) @@ -69,7 
+117,7 @@ void testing_csrmv_bad_arg(void) rocsparse_int* dcol_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol_null, dx, &beta, dy); + handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol_null, dx, &beta, dy, nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); } // testing for(nullptr == dval) @@ -77,7 +125,7 @@ void testing_csrmv_bad_arg(void) T* dval_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr, dval_null, dptr, dcol, dx, &beta, dy); + handle, trans, m, n, nnz, &alpha, descr, dval_null, dptr, dcol, dx, &beta, dy, nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); } // testing for(nullptr == dx) @@ -85,7 +133,7 @@ void testing_csrmv_bad_arg(void) T* dx_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx_null, &beta, dy); + handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx_null, &beta, dy, nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); } // testing for(nullptr == dy) @@ -93,7 +141,7 @@ void testing_csrmv_bad_arg(void) T* dy_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy_null); + handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy_null, nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: dy is nullptr"); } // testing for(nullptr == d_alpha) @@ -101,7 +149,7 @@ void testing_csrmv_bad_arg(void) T* d_alpha_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, d_alpha_null, descr, dval, dptr, dcol, dx, &beta, dy); + handle, trans, m, n, nnz, d_alpha_null, descr, dval, dptr, dcol, dx, &beta, dy, nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } // testing for(nullptr == d_beta) @@ -109,7 +157,7 @@ void testing_csrmv_bad_arg(void) T* d_beta_null = 
nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, d_beta_null, dy); + handle, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, d_beta_null, dy, nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); } // testing for(nullptr == descr) @@ -117,7 +165,7 @@ void testing_csrmv_bad_arg(void) rocsparse_mat_descr descr_null = nullptr; status = rocsparse_csrmv( - handle, trans, m, n, nnz, &alpha, descr_null, dval, dptr, dcol, dx, &beta, dy); + handle, trans, m, n, nnz, &alpha, descr_null, dval, dptr, dcol, dx, &beta, dy, nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } // testing for(nullptr == handle) @@ -125,11 +173,20 @@ void testing_csrmv_bad_arg(void) rocsparse_handle handle_null = nullptr; status = rocsparse_csrmv( - handle_null, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy); + handle_null, trans, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy, nullptr); verify_rocsparse_status_invalid_handle(status); } } +template +static T two_sum(T x, T y, T* sumk_err) +{ + T sumk_s = x + y; + T bp = sumk_s - x; + (*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); + return sumk_s; +} + template rocsparse_status testing_csrmv(Arguments argus) { @@ -140,6 +197,7 @@ rocsparse_status testing_csrmv(Arguments argus) T h_beta = argus.beta; rocsparse_operation trans = argus.trans; rocsparse_index_base idx_base = argus.idx_base; + bool adaptive = argus.bswitch; std::string binfile = ""; std::string filename = ""; rocsparse_status status; @@ -163,6 +221,14 @@ rocsparse_status testing_csrmv(Arguments argus) std::unique_ptr test_descr(new descr_struct); rocsparse_mat_descr descr = test_descr->descr; + std::unique_ptr unique_ptr_csrmv_info(new csrmv_info_struct); + rocsparse_csrmv_info info = nullptr; + + if(adaptive) + { + info = unique_ptr_csrmv_info->info; + } + // Set matrix index base 
CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr, idx_base)); @@ -198,9 +264,24 @@ rocsparse_status testing_csrmv(Arguments argus) return rocsparse_status_memory_error; } + if(adaptive) + { + // Test rocsparse_csrmv_analysis + status = rocsparse_csrmv_analysis(handle, trans, m, n, nnz, descr, dptr, dcol, info); + + if(m < 0 || n < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || n < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); + } + } + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); status = rocsparse_csrmv( - handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy); + handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy, info); if(m < 0 || n < 0 || nnz < 0) { @@ -320,6 +401,12 @@ rocsparse_status testing_csrmv(Arguments argus) CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); + if(adaptive) + { + // csrmv analysis + CHECK_ROCSPARSE_ERROR(rocsparse_csrmv_analysis(handle, trans, m, n, nnz, descr, dptr, dcol, info)); + } + if(argus.unit_check) { CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * m, hipMemcpyHostToDevice)); @@ -327,12 +414,12 @@ rocsparse_status testing_csrmv(Arguments argus) // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); CHECK_ROCSPARSE_ERROR(rocsparse_csrmv( - handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1)); + handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1, info)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); CHECK_ROCSPARSE_ERROR(rocsparse_csrmv( - handle, trans, m, n, nnz, d_alpha, descr, dval, dptr, dcol, dx, d_beta, dy_2)); + handle, 
trans, m, n, nnz, d_alpha, descr, dval, dptr, dcol, dx, d_beta, dy_2, info)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); @@ -341,84 +428,112 @@ rocsparse_status testing_csrmv(Arguments argus) // CPU - do the csrmv row reduction in the same order as the GPU double cpu_time_used = get_time_us(); - // Query for warpSize - hipDeviceProp_t prop; - hipGetDeviceProperties(&prop, 0); + // Different csrmv algorithms require different CPU summation + if(adaptive) + { + for(rocsparse_int i = 0; i < m; ++i) + { + hy_gold[i] *= h_beta; + T sum = hy_gold[i]; + T err = static_cast(0); - rocsparse_int WF_SIZE; - rocsparse_int nnz_per_row = nnz / m; + for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; ++j) + { + sum = two_sum(sum, h_alpha * hval[j] * hx[hcol_ind[j] - idx_base], &err); + } - if(prop.warpSize == 32) - { - if(nnz_per_row < 4) - WF_SIZE = 2; - else if(nnz_per_row < 8) - WF_SIZE = 4; - else if(nnz_per_row < 16) - WF_SIZE = 8; - else if(nnz_per_row < 32) - WF_SIZE = 16; - else - WF_SIZE = 32; - } - else if(prop.warpSize == 64) - { - if(nnz_per_row < 4) - WF_SIZE = 2; - else if(nnz_per_row < 8) - WF_SIZE = 4; - else if(nnz_per_row < 16) - WF_SIZE = 8; - else if(nnz_per_row < 32) - WF_SIZE = 16; - else if(nnz_per_row < 64) - WF_SIZE = 32; - else - WF_SIZE = 64; + hy_gold[i] = (T)(sum + err); + } } else { - return rocsparse_status_internal_error; - } - - for(rocsparse_int i = 0; i < m; ++i) - { - std::vector sum(WF_SIZE, 0.0); - - for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; - j += WF_SIZE) + // Query for warpSize + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, 0); + + rocsparse_int WF_SIZE; + rocsparse_int nnz_per_row = nnz / m; + + if(prop.warpSize == 32) { - for(rocsparse_int k = 0; k < WF_SIZE; ++k) - { - if(j + k < hcsr_row_ptr[i + 1] - idx_base) - { - sum[k] = fma(h_alpha * hval[j + k], hx[hcol_ind[j + k] - 
idx_base], sum[k]); - } - } + if(nnz_per_row < 4) + WF_SIZE = 2; + else if(nnz_per_row < 8) + WF_SIZE = 4; + else if(nnz_per_row < 16) + WF_SIZE = 8; + else if(nnz_per_row < 32) + WF_SIZE = 16; + else + WF_SIZE = 32; } - - for(rocsparse_int j = 1; j < WF_SIZE; j <<= 1) + else if(prop.warpSize == 64) { - for(rocsparse_int k = 0; k < WF_SIZE - j; ++k) - { - sum[k] += sum[k + j]; - } + if(nnz_per_row < 4) + WF_SIZE = 2; + else if(nnz_per_row < 8) + WF_SIZE = 4; + else if(nnz_per_row < 16) + WF_SIZE = 8; + else if(nnz_per_row < 32) + WF_SIZE = 16; + else if(nnz_per_row < 64) + WF_SIZE = 32; + else + WF_SIZE = 64; } - - if(h_beta == 0.0) + else { - hy_gold[i] = sum[0]; + return rocsparse_status_internal_error; } - else + + for(rocsparse_int i = 0; i < m; ++i) { - hy_gold[i] = std::fma(h_beta, hy_gold[i], sum[0]); + std::vector sum(WF_SIZE, 0.0); + + for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; + j += WF_SIZE) + { + for(rocsparse_int k = 0; k < WF_SIZE; ++k) + { + if(j + k < hcsr_row_ptr[i + 1] - idx_base) + { + sum[k] = fma(h_alpha * hval[j + k], hx[hcol_ind[j + k] - idx_base], sum[k]); + } + } + } + + for(rocsparse_int j = 1; j < WF_SIZE; j <<= 1) + { + for(rocsparse_int k = 0; k < WF_SIZE - j; ++k) + { + sum[k] += sum[k + j]; + } + } + + if(h_beta == 0.0) + { + hy_gold[i] = sum[0]; + } + else + { + hy_gold[i] = std::fma(h_beta, hy_gold[i], sum[0]); + } } } cpu_time_used = get_time_us() - cpu_time_used; - unit_check_general(1, m, hy_gold.data(), hy_1.data()); - unit_check_general(1, m, hy_gold.data(), hy_2.data()); + if(adaptive) + { + unit_check_near(1, m, hy_gold.data(), hy_1.data()); + unit_check_near(1, m, hy_gold.data(), hy_2.data()); + } + else + { + unit_check_general(1, m, hy_gold.data(), hy_1.data()); + unit_check_general(1, m, hy_gold.data(), hy_2.data()); + } } if(argus.timing) @@ -430,7 +545,7 @@ rocsparse_status testing_csrmv(Arguments argus) for(int iter = 0; iter < number_cold_calls; iter++) { rocsparse_csrmv( - 
handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1); + handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1, info); } double gpu_time_used = get_time_us(); // in microseconds @@ -438,7 +553,7 @@ rocsparse_status testing_csrmv(Arguments argus) for(int iter = 0; iter < number_hot_calls; iter++) { rocsparse_csrmv( - handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1); + handle, trans, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1, info); } // Convert to miliseconds per call diff --git a/clients/tests/test_csrmv.cpp b/clients/tests/test_csrmv.cpp index f02dc39c..23e061d1 100644 --- a/clients/tests/test_csrmv.cpp +++ b/clients/tests/test_csrmv.cpp @@ -12,7 +12,7 @@ #include typedef rocsparse_index_base base; -typedef std::tuple csrmv_tuple; +typedef std::tuple csrmv_tuple; int csr_M_range[] = {-99, -1, 0, 500, 7111}; int csr_N_range[] = {-99, 0, 842, 4441}; @@ -37,6 +37,8 @@ std::string csr_bin[] = {"rma10.bin", "nos6.bin", "nos7.bin"}; +bool csr_adaptive[] = {false, true}; + class parameterized_csrmv : public testing::TestWithParam { protected: @@ -54,6 +56,7 @@ Arguments setup_csrmv_arguments(csrmv_tuple tup) arg.alpha = std::get<2>(tup); arg.beta = std::get<3>(tup); arg.idx_base = std::get<4>(tup); + arg.bswitch = std::get<6>(tup); arg.timing = 0; // Determine absolute path of test matrix @@ -102,4 +105,5 @@ INSTANTIATE_TEST_CASE_P(csrmv, testing::ValuesIn(csr_alpha_range), testing::ValuesIn(csr_beta_range), testing::ValuesIn(csr_idxbase_range), - testing::ValuesIn(csr_bin))); + testing::ValuesIn(csr_bin), + testing::ValuesIn(csr_adaptive))); From 890ac2f3f798aa1637e22346f827901b8eaf039d Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 11:20:43 +0200 Subject: [PATCH 213/304] fixed samples to work with rocsparse_int == int64_t --- clients/samples/example_coomv.cpp | 34 +++++++++++++++---------------- clients/samples/example_csrmv.cpp | 28 
++++++++++++------------- clients/samples/example_ellmv.cpp | 28 ++++++++++++------------- clients/samples/example_hybmv.cpp | 28 ++++++++++++------------- 4 files changed, 59 insertions(+), 59 deletions(-) diff --git a/clients/samples/example_coomv.cpp b/clients/samples/example_coomv.cpp index ec5cf2c4..9f8f6284 100644 --- a/clients/samples/example_coomv.cpp +++ b/clients/samples/example_coomv.cpp @@ -18,9 +18,9 @@ int main(int argc, char* argv[]) return -1; } - int ndim = atoi(argv[1]); - int trials = 200; - int batch_size = 1; + rocsparse_int ndim = atoi(argv[1]); + int trials = 200; + int batch_size = 1; if(argc > 2) { @@ -43,19 +43,19 @@ int main(int argc, char* argv[]) printf("Device: %s\n", devProp.name); // Generate problem - std::vector hAptr; - std::vector hAcol; + std::vector hAptr; + std::vector hAcol; std::vector hAval; - int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); - int n = m; - int nnz = hAptr[m]; + rocsparse_int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); + rocsparse_int n = m; + rocsparse_int nnz = hAptr[m]; // Convert to COO matrix - std::vector hArow(nnz); + std::vector hArow(nnz); - for(int i = 0; i < m; ++i) + for(rocsparse_int i = 0; i < m; ++i) { - for(int j = hAptr[i]; j < hAptr[i + 1]; ++j) + for(rocsparse_int j = hAptr[i]; j < hAptr[i + 1]; ++j) { hArow[j] = i; } @@ -75,20 +75,20 @@ int main(int argc, char* argv[]) rocsparse_create_mat_descr(&descrA); // Offload data to device - int* dArow = NULL; - int* dAcol = NULL; + rocsparse_int* dArow = NULL; + rocsparse_int* dAcol = NULL; double* dAval = NULL; double* dx = NULL; double* dy = NULL; - hipMalloc((void**)&dArow, sizeof(int) * nnz); - hipMalloc((void**)&dAcol, sizeof(int) * nnz); + hipMalloc((void**)&dArow, sizeof(rocsparse_int) * nnz); + hipMalloc((void**)&dAcol, sizeof(rocsparse_int) * nnz); hipMalloc((void**)&dAval, sizeof(double) * nnz); hipMalloc((void**)&dx, sizeof(double) * n); hipMalloc((void**)&dy, 
sizeof(double) * m); - hipMemcpy(dArow, hArow.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); - hipMemcpy(dAcol, hAcol.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dArow, hArow.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dAcol, hAcol.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice); hipMemcpy(dAval, hAval.data(), sizeof(double) * nnz, hipMemcpyHostToDevice); hipMemcpy(dx, hx.data(), sizeof(double) * n, hipMemcpyHostToDevice); diff --git a/clients/samples/example_csrmv.cpp b/clients/samples/example_csrmv.cpp index b5063b4f..d58a916f 100644 --- a/clients/samples/example_csrmv.cpp +++ b/clients/samples/example_csrmv.cpp @@ -18,9 +18,9 @@ int main(int argc, char* argv[]) return -1; } - int ndim = atoi(argv[1]); - int trials = 200; - int batch_size = 1; + rocsparse_int ndim = atoi(argv[1]); + int trials = 200; + int batch_size = 1; if(argc > 2) { @@ -43,12 +43,12 @@ int main(int argc, char* argv[]) printf("Device: %s\n", devProp.name); // Generate problem - std::vector hAptr; - std::vector hAcol; + std::vector hAptr; + std::vector hAcol; std::vector hAval; - int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); - int n = m; - int nnz = hAptr[m]; + rocsparse_int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); + rocsparse_int n = m; + rocsparse_int nnz = hAptr[m]; // Sample some random data srand(12345ULL); @@ -64,20 +64,20 @@ int main(int argc, char* argv[]) rocsparse_create_mat_descr(&descrA); // Offload data to device - int* dAptr = NULL; - int* dAcol = NULL; + rocsparse_int* dAptr = NULL; + rocsparse_int* dAcol = NULL; double* dAval = NULL; double* dx = NULL; double* dy = NULL; - hipMalloc((void**)&dAptr, sizeof(int) * (m + 1)); - hipMalloc((void**)&dAcol, sizeof(int) * nnz); + hipMalloc((void**)&dAptr, sizeof(rocsparse_int) * (m + 1)); + hipMalloc((void**)&dAcol, sizeof(rocsparse_int) * nnz); hipMalloc((void**)&dAval, sizeof(double) * nnz); 
hipMalloc((void**)&dx, sizeof(double) * n); hipMalloc((void**)&dy, sizeof(double) * m); - hipMemcpy(dAptr, hAptr.data(), sizeof(int) * (m + 1), hipMemcpyHostToDevice); - hipMemcpy(dAcol, hAcol.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dAptr, hAptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice); + hipMemcpy(dAcol, hAcol.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice); hipMemcpy(dAval, hAval.data(), sizeof(double) * nnz, hipMemcpyHostToDevice); hipMemcpy(dx, hx.data(), sizeof(double) * n, hipMemcpyHostToDevice); diff --git a/clients/samples/example_ellmv.cpp b/clients/samples/example_ellmv.cpp index 2cf296b2..8d49d837 100644 --- a/clients/samples/example_ellmv.cpp +++ b/clients/samples/example_ellmv.cpp @@ -18,9 +18,9 @@ int main(int argc, char* argv[]) return -1; } - int ndim = atoi(argv[1]); - int trials = 200; - int batch_size = 1; + rocsparse_int ndim = atoi(argv[1]); + int trials = 200; + int batch_size = 1; if(argc > 2) { @@ -43,12 +43,12 @@ int main(int argc, char* argv[]) printf("Device: %s\n", devProp.name); // Generate problem in CSR format - std::vector hAptr; - std::vector hAcol; + std::vector hAptr; + std::vector hAcol; std::vector hAval; - int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); - int n = m; - int nnz = hAptr[m]; + rocsparse_int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); + rocsparse_int n = m; + rocsparse_int nnz = hAptr[m]; // Sample some random data srand(12345ULL); @@ -64,20 +64,20 @@ int main(int argc, char* argv[]) rocsparse_create_mat_descr(&descrA); // Offload data to device - int* dAptr = NULL; - int* dAcol = NULL; + rocsparse_int* dAptr = NULL; + rocsparse_int* dAcol = NULL; double* dAval = NULL; double* dx = NULL; double* dy = NULL; - hipMalloc((void**)&dAptr, sizeof(int) * (m + 1)); - hipMalloc((void**)&dAcol, sizeof(int) * nnz); + hipMalloc((void**)&dAptr, sizeof(rocsparse_int) * (m + 1)); + hipMalloc((void**)&dAcol, 
sizeof(rocsparse_int) * nnz); hipMalloc((void**)&dAval, sizeof(double) * nnz); hipMalloc((void**)&dx, sizeof(double) * n); hipMalloc((void**)&dy, sizeof(double) * m); - hipMemcpy(dAptr, hAptr.data(), sizeof(int) * (m + 1), hipMemcpyHostToDevice); - hipMemcpy(dAcol, hAcol.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dAptr, hAptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice); + hipMemcpy(dAcol, hAcol.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice); hipMemcpy(dAval, hAval.data(), sizeof(double) * nnz, hipMemcpyHostToDevice); hipMemcpy(dx, hx.data(), sizeof(double) * n, hipMemcpyHostToDevice); diff --git a/clients/samples/example_hybmv.cpp b/clients/samples/example_hybmv.cpp index 2bdcec6b..27283d07 100644 --- a/clients/samples/example_hybmv.cpp +++ b/clients/samples/example_hybmv.cpp @@ -18,9 +18,9 @@ int main(int argc, char* argv[]) return -1; } - int ndim = atoi(argv[1]); - int trials = 200; - int batch_size = 1; + rocsparse_int ndim = atoi(argv[1]); + int trials = 200; + int batch_size = 1; if(argc > 2) { @@ -43,12 +43,12 @@ int main(int argc, char* argv[]) printf("Device: %s\n", devProp.name); // Generate problem in CSR format - std::vector hAptr; - std::vector hAcol; + std::vector hAptr; + std::vector hAcol; std::vector hAval; - int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); - int n = m; - int nnz = hAptr[m]; + rocsparse_int m = gen_2d_laplacian(ndim, hAptr, hAcol, hAval, rocsparse_index_base_zero); + rocsparse_int n = m; + rocsparse_int nnz = hAptr[m]; // Sample some random data srand(12345ULL); @@ -64,20 +64,20 @@ int main(int argc, char* argv[]) rocsparse_create_mat_descr(&descrA); // Offload data to device - int* dAptr = NULL; - int* dAcol = NULL; + rocsparse_int* dAptr = NULL; + rocsparse_int* dAcol = NULL; double* dAval = NULL; double* dx = NULL; double* dy = NULL; - hipMalloc((void**)&dAptr, sizeof(int) * (m + 1)); - hipMalloc((void**)&dAcol, sizeof(int) * nnz); + 
hipMalloc((void**)&dAptr, sizeof(rocsparse_int) * (m + 1)); + hipMalloc((void**)&dAcol, sizeof(rocsparse_int) * nnz); hipMalloc((void**)&dAval, sizeof(double) * nnz); hipMalloc((void**)&dx, sizeof(double) * n); hipMalloc((void**)&dy, sizeof(double) * m); - hipMemcpy(dAptr, hAptr.data(), sizeof(int) * (m + 1), hipMemcpyHostToDevice); - hipMemcpy(dAcol, hAcol.data(), sizeof(int) * nnz, hipMemcpyHostToDevice); + hipMemcpy(dAptr, hAptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice); + hipMemcpy(dAcol, hAcol.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice); hipMemcpy(dAval, hAval.data(), sizeof(double) * nnz, hipMemcpyHostToDevice); hipMemcpy(dx, hx.data(), sizeof(double) * n, hipMemcpyHostToDevice); From 80df7b4bb4cb6e1635f06bf0b2f2b7e8462290eb Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 11:22:11 +0200 Subject: [PATCH 214/304] fixes so that rocsparse_int == int64_t works --- clients/common/utility.cpp | 2 +- library/src/include/handle.h | 4 ++-- library/src/level3/rocsparse_csrmm.hpp | 11 ++++++----- library/src/rocsparse_auxiliary.cpp | 20 ++++++++++++++------ 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/clients/common/utility.cpp b/clients/common/utility.cpp index 7307e6df..9d371bea 100644 --- a/clients/common/utility.cpp +++ b/clients/common/utility.cpp @@ -29,7 +29,7 @@ rocsparse_int query_device_property() printf("Query device success: there are %d devices\n", device_count); } - for(rocsparse_int i = 0; i < device_count; i++) + for(int i = 0; i < device_count; i++) { hipDeviceProp_t props; rocsparse_status status = (rocsparse_status)hipGetDeviceProperties(&props, i); diff --git a/library/src/include/handle.h b/library/src/include/handle.h index 8588d25e..73283297 100644 --- a/library/src/include/handle.h +++ b/library/src/include/handle.h @@ -35,11 +35,11 @@ struct _rocsparse_handle rocsparse_status get_stream(hipStream_t* user_stream) const; // device id - rocsparse_int device; + int device; // 
device properties hipDeviceProp_t properties; // device warp size - rocsparse_int warp_size; + int warp_size; // stream ; default stream is system stream NULL hipStream_t stream = 0; // pointer mode ; default mode is host diff --git a/library/src/level3/rocsparse_csrmm.hpp b/library/src/level3/rocsparse_csrmm.hpp index c37f45d3..627113c4 100644 --- a/library/src/level3/rocsparse_csrmm.hpp +++ b/library/src/level3/rocsparse_csrmm.hpp @@ -278,18 +278,19 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, } // Check leading dimension of B + rocsparse_int one = 1; if(trans_B == rocsparse_operation_none) { if(trans_A == rocsparse_operation_none) { - if(ldb < std::max(1, k)) + if(ldb < std::max(one, k)) { return rocsparse_status_invalid_size; } } else { - if(ldb < std::max(1, m)) + if(ldb < std::max(one, m)) { return rocsparse_status_invalid_size; } @@ -297,7 +298,7 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, } else { - if(ldb < std::max(1, n)) + if(ldb < std::max(one, n)) { return rocsparse_status_invalid_size; } @@ -306,14 +307,14 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, // Check leading dimension of C if(trans_A == rocsparse_operation_none) { - if(ldc < std::max(1, m)) + if(ldc < std::max(one, m)) { return rocsparse_status_invalid_size; } } else { - if(ldc < std::max(1, k)) + if(ldc < std::max(one, k)) { return rocsparse_status_invalid_size; } diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index e9a6d413..4f4520e5 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -128,7 +128,7 @@ rocsparse_status rocsparse_get_stream(rocsparse_handle handle, hipStream_t* stre * version / 100 % 1000 = minor version * version / 100000 = major version *******************************************************************************/ -rocsparse_status rocsparse_get_version(rocsparse_handle handle, rocsparse_int* version) 
+rocsparse_status rocsparse_get_version(rocsparse_handle handle, int* version) { // Check if handle is valid if(handle == nullptr) @@ -137,7 +137,9 @@ rocsparse_status rocsparse_get_version(rocsparse_handle handle, rocsparse_int* v } *version = ROCSPARSE_VERSION_MAJOR * 100000 + ROCSPARSE_VERSION_MINOR * 100 + ROCSPARSE_VERSION_PATCH; + log_trace(handle, "rocsparse_get_version", *version); + return rocsparse_status_success; } @@ -351,14 +353,20 @@ rocsparse_status rocsparse_create_mat_info(rocsparse_mat_info* info) *******************************************************************************/ rocsparse_status rocsparse_destroy_mat_info(rocsparse_mat_info info) { + if(info == nullptr) + { + return rocsparse_status_success; + } + + // Clear csrmv info struct + if(info->csrmv_info != nullptr) + { + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrmv_info(info->csrmv_info)); + } + // Destruct try { - if(info->csrmv_info != nullptr) - { - RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrmv_info(info->csrmv_info)); - } - delete info; } catch(const rocsparse_status& status) From e9a2919601e12548f7702bc1efd76b71eebfbdc2 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 11:23:25 +0200 Subject: [PATCH 215/304] rocsparse_int == int64_t fix #2 --- library/include/rocsparse-auxiliary.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/include/rocsparse-auxiliary.h b/library/include/rocsparse-auxiliary.h index f2ad3e98..ea19ceb3 100644 --- a/library/include/rocsparse-auxiliary.h +++ b/library/include/rocsparse-auxiliary.h @@ -69,7 +69,7 @@ rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, * version / 100000 = major version *******************************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_get_version(rocsparse_handle handle, rocsparse_int* version); +rocsparse_status rocsparse_get_version(rocsparse_handle handle, int* version); 
/******************************************************************************** * \brief rocsparse_mat_descr is a structure holding the rocsparse matrix From e49fc3d6f5349a2ee1dfa505a5962214868ffec3 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 11:24:15 +0200 Subject: [PATCH 216/304] added csrmv_analysis_clear() and adjusted tests for new global info struct --- .../rocsparse_template_specialization.cpp | 4 +- clients/include/rocsparse.hpp | 2 +- clients/include/rocsparse_test_unique_ptr.hpp | 16 ++++---- clients/include/testing_csrmv.hpp | 40 ++++++++++++++++--- clients/tests/test_csrmv.cpp | 2 +- library/include/rocsparse-functions.h | 22 ++++++++++ library/src/handle.cpp | 12 +++--- 7 files changed, 74 insertions(+), 24 deletions(-) diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index a7c59ebf..a8c7320c 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -200,7 +200,7 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, const float* x, const float* beta, float* y, - const rocsparse_csrmv_info info) + const rocsparse_mat_info info) { return rocsparse_scsrmv( handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); @@ -220,7 +220,7 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, const double* x, const double* beta, double* y, - const rocsparse_csrmv_info info) + const rocsparse_mat_info info) { return rocsparse_dcsrmv( handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index 8caf89ef..fb025f0c 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -90,7 +90,7 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, const T* x, const T* beta, T* y, - const rocsparse_csrmv_info info); + const 
rocsparse_mat_info info); template rocsparse_status rocsparse_ellmv(rocsparse_handle handle, diff --git a/clients/include/rocsparse_test_unique_ptr.hpp b/clients/include/rocsparse_test_unique_ptr.hpp index 444d94cc..ebcdc228 100644 --- a/clients/include/rocsparse_test_unique_ptr.hpp +++ b/clients/include/rocsparse_test_unique_ptr.hpp @@ -87,19 +87,19 @@ struct hyb_struct } }; -struct csrmv_info_struct +struct mat_info_struct { - rocsparse_csrmv_info info; - csrmv_info_struct() + rocsparse_mat_info info; + mat_info_struct() { - rocsparse_status status = rocsparse_create_csrmv_info(&info); - verify_rocsparse_status_success(status, "ERROR: csrmv_info_struct constructor"); + rocsparse_status status = rocsparse_create_mat_info(&info); + verify_rocsparse_status_success(status, "ERROR: mat_info_struct constructor"); } - ~csrmv_info_struct() + ~mat_info_struct() { - rocsparse_status status = rocsparse_destroy_csrmv_info(info); - verify_rocsparse_status_success(status, "ERROR: csrmv_info_struct destructor"); + rocsparse_status status = rocsparse_destroy_mat_info(info); + verify_rocsparse_status_success(status, "ERROR: mat_info_struct destructor"); } }; diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index fba17688..c9bdc75f 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -36,8 +36,8 @@ void testing_csrmv_bad_arg(void) std::unique_ptr unique_ptr_descr(new descr_struct); rocsparse_mat_descr descr = unique_ptr_descr->descr; - std::unique_ptr unique_ptr_csrmv_info(new csrmv_info_struct); - rocsparse_csrmv_info info = unique_ptr_csrmv_info->info; + std::unique_ptr unique_ptr_mat_info(new mat_info_struct); + rocsparse_mat_info info = unique_ptr_mat_info->info; auto dptr_managed = rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; @@ -87,7 +87,7 @@ void testing_csrmv_bad_arg(void) } // testing for(nullptr == info) { - rocsparse_csrmv_info info_null = nullptr; + 
rocsparse_mat_info info_null = nullptr; status = rocsparse_csrmv_analysis( handle, transA, m, n, nnz, descr, dptr, dcol, info_null); @@ -176,6 +176,23 @@ void testing_csrmv_bad_arg(void) handle_null, transA, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy, nullptr); verify_rocsparse_status_invalid_handle(status); } + + // testing rocsparse_csrmv_analysis_clear + + // testing for(nullptr == info) + { + rocsparse_mat_info info_null = nullptr; + + status = rocsparse_csrmv_analysis_clear(handle, info_null); + verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrmv_analysis_clear(handle_null, info); + verify_rocsparse_status_invalid_handle(status); + } } template @@ -221,12 +238,12 @@ rocsparse_status testing_csrmv(Arguments argus) std::unique_ptr test_descr(new descr_struct); rocsparse_mat_descr descr = test_descr->descr; - std::unique_ptr unique_ptr_csrmv_info(new csrmv_info_struct); - rocsparse_csrmv_info info = nullptr; + std::unique_ptr unique_ptr_mat_info(new mat_info_struct); + rocsparse_mat_info info = nullptr; if(adaptive) { - info = unique_ptr_csrmv_info->info; + info = unique_ptr_mat_info->info; } // Set matrix index base @@ -292,6 +309,11 @@ rocsparse_status testing_csrmv(Arguments argus) verify_rocsparse_status_success(status, "m >= 0 && n >= 0 && nnz >= 0"); } + if(adaptive) + { + CHECK_ROCSPARSE_ERROR(rocsparse_csrmv_analysis_clear(handle, info)); + } + return rocsparse_status_success; } @@ -577,6 +599,12 @@ rocsparse_status testing_csrmv(Arguments argus) bandwidth, gpu_time_used); } + + if(adaptive) + { + CHECK_ROCSPARSE_ERROR(rocsparse_csrmv_analysis_clear(handle, info)); + } + return rocsparse_status_success; } diff --git a/clients/tests/test_csrmv.cpp b/clients/tests/test_csrmv.cpp index a4356d13..e8ccdef8 100644 --- a/clients/tests/test_csrmv.cpp +++ b/clients/tests/test_csrmv.cpp @@ -66,7 +66,7 @@ Arguments 
setup_csrmv_arguments(csrmv_tuple tup) arg.alpha = std::get<2>(tup); arg.beta = std::get<3>(tup); arg.idx_base = std::get<4>(tup); - arg.bswitch = std::get<6>(tup); + arg.bswitch = std::get<5>(tup); arg.timing = 0; return arg; } diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 7e9d41da..45267a12 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -569,6 +569,28 @@ rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, const rocsparse_int* csr_col_ind, rocsparse_mat_info info); + +/*! \brief SPARSE Level 2 API + + \details + csrmv_analysis_clear frees all memory that was allocated by csrmv_analysis. This is + especially useful, if memory is an issue and the analysis data is not required for + further computation, e.g. when switching to another sparse matrix format. + Calling csrmv_analysis_clear is optional. All allocated resources will be cleared, + when rocsparse_destroy_mat_info is called. + + @param[in] + handle rocsparse_handle. + handle to the rocsparse library context queue. + @param[inout] + info structure that holds the information collected during + the analysis phase. + + ********************************************************************/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csrmv_analysis_clear(rocsparse_handle handle, + rocsparse_mat_info info); + /*! 
\brief SPARSE Level 2 API \details diff --git a/library/src/handle.cpp b/library/src/handle.cpp index a4f3a78a..dd2ff277 100644 --- a/library/src/handle.cpp +++ b/library/src/handle.cpp @@ -122,15 +122,15 @@ rocsparse_status rocsparse_destroy_csrmv_info(rocsparse_csrmv_info info) return rocsparse_status_success; } + // Clean up row blocks + if(info->size > 0) + { + RETURN_IF_HIP_ERROR(hipFree(info->row_blocks)); + } + // Destruct try { - // Clean up row blocks - if(info->row_blocks != nullptr) - { - hipFree(info->row_blocks); - } - delete info; } catch(const rocsparse_status& status) From 1fadc075fe2e07a801e34fd0e3570b1544a6b915 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 11:43:16 +0200 Subject: [PATCH 217/304] built flag for csrmv info struct --- library/src/include/handle.h | 6 ++++-- library/src/level2/rocsparse_csrmv.hpp | 8 ++------ library/src/rocsparse_auxiliary.cpp | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/library/src/include/handle.h b/library/src/include/handle.h index 73283297..8d2bf6ab 100644 --- a/library/src/include/handle.h +++ b/library/src/include/handle.h @@ -112,6 +112,10 @@ struct _rocsparse_hyb_mat *******************************************************************************/ struct _rocsparse_mat_info { + // built flags + bool csrmv_built = false; + + // info structs rocsparse_csrmv_info csrmv_info = nullptr; }; @@ -125,8 +129,6 @@ struct _rocsparse_mat_info *******************************************************************************/ struct _rocsparse_csrmv_info { - // built flag - bool built = false; // num row blocks size_t size = 0; // row blocks diff --git a/library/src/level2/rocsparse_csrmv.hpp b/library/src/level2/rocsparse_csrmv.hpp index 4018c3b4..9ea5b824 100644 --- a/library/src/level2/rocsparse_csrmv.hpp +++ b/library/src/level2/rocsparse_csrmv.hpp @@ -220,7 +220,7 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, // If csrmv info is not available, call csrmv 
general return rocsparse_csrmv_general_template(handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } - else if(info->csrmv_info == nullptr) + else if(info->csrmv_built == false) { // If csrmv info is not available, call csrmv general return rocsparse_csrmv_general_template(handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); @@ -691,11 +691,7 @@ rocsparse_status rocsparse_csrmv_adaptive_template(rocsparse_handle handle, const rocsparse_csrmv_info info) { // Check if info matches current matrix and options - if(info->built == false) - { - return rocsparse_status_invalid_value; - } - else if(info->trans != trans) + if(info->trans != trans) { return rocsparse_status_invalid_value; } diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index 4f4520e5..fb3490fe 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -359,7 +359,7 @@ rocsparse_status rocsparse_destroy_mat_info(rocsparse_mat_info info) } // Clear csrmv info struct - if(info->csrmv_info != nullptr) + if(info->csrmv_built == true) { RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrmv_info(info->csrmv_info)); } From 610de0e787f1cf4be305eea934d8e01ca6c3c621 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 11:43:42 +0200 Subject: [PATCH 218/304] csrmv changes to work with rocsparse_int == int64_t --- library/src/level2/csrmv_device.h | 123 ++++++++++++++--------- library/src/level2/rocsparse_csrmv.cpp | 129 ++++++++++++++++--------- 2 files changed, 164 insertions(+), 88 deletions(-) diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index 85085b47..f61642f0 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -10,9 +10,9 @@ __device__ int __llvm_amdgcn_readlane(int index, int offset) __asm("llvm.amdgcn. 
#endif #if defined(__HIP_PLATFORM_HCC__) -// Swizzle-based float reduction +// Swizzle-based float wavefront reduction template -__device__ float reduction(float sum) +__device__ float wf_reduce(float sum) { typedef union flt_b32 { @@ -64,9 +64,9 @@ __device__ float reduction(float sum) return sum; } -// Swizzle-based double reduction +// Swizzle-based double wavefront reduction template -__device__ double reduction(double sum) +__device__ double wf_reduce(double sum) { typedef union dbl_b32 { @@ -125,7 +125,7 @@ __device__ double reduction(double sum) } #elif defined(__HIP_PLATFORM_NVCC__) template -__device__ T reduction(T sum) +__device__ T wf_reduce(T sum) { for(rocsparse_int i = SUBWAVE_SIZE >> 1; i > 0; i >>= 1) { @@ -168,7 +168,7 @@ static __device__ void csrmvn_general_device(rocsparse_int m, } // Obtain row sum using parallel reduction - sum = reduction(sum); + sum = wf_reduce(sum); // First thread of each subwave writes result into global memory if(lid == 0) @@ -211,9 +211,26 @@ __device__ static __inline__ void atomic_add(double *address, double val) while(atomicCAS((unsigned long long*)address, prevVal, newVal) != prevVal); } +// rocsparse_int == int32_t +__device__ static __inline__ int32_t mul24(int32_t x, int32_t y) +{ + return ((x << 8) >> 8) * ((y << 8) >> 8); +} + +// rocsparse_int == int64_t +__device__ static __inline__ int64_t mul24(int64_t x, int64_t y) +{ + return ((x << 40) >> 40) * ((y << 40) >> 40); +} + +__device__ static __inline__ rocsparse_int mad24(rocsparse_int x, rocsparse_int y, rocsparse_int z) +{ + return mul24(x, y) + z; +} + template static inline __device__ T -sum2_reduce(T cur_sum, T* partial, int lid, int max_size, int reduc_size) +sum2_reduce(T cur_sum, T* partial, rocsparse_int lid, rocsparse_int max_size, rocsparse_int reduc_size) { if(max_size > reduc_size) { @@ -242,8 +259,8 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, rocsparse_index_base idx_base) { __shared__ T partialSums[BLOCKSIZE]; - 
unsigned int gid = hipBlockIdx_x; - unsigned int lid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x; + rocsparse_int lid = hipThreadIdx_x; // The row blocks buffer holds a packed set of information used to inform each // workgroup about how to do its work: @@ -266,20 +283,20 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // know when the first workgroup for that row has finished initializing the output // value. While this bit is the same as the first workgroup's flag bit, this // workgroup will spin-loop. - unsigned int row = ((row_blocks[gid] >> (64 - ROW_BITS)) & ((1ULL << ROW_BITS) - 1ULL)); - unsigned int stop_row = + rocsparse_int row = ((row_blocks[gid] >> (64 - ROW_BITS)) & ((1ULL << ROW_BITS) - 1ULL)); + rocsparse_int stop_row = ((row_blocks[gid + 1] >> (64 - ROW_BITS)) & ((1ULL << ROW_BITS) - 1ULL)); - unsigned int num_rows = stop_row - row; + rocsparse_int num_rows = stop_row - row; // Get the workgroup within this long row ID out of the bottom bits of the row block. - unsigned int wg = row_blocks[gid] & ((1 << WG_BITS) - 1); + rocsparse_int wg = row_blocks[gid] & ((1 << WG_BITS) - 1); // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. // If there are more items in this row, we assign more workgroups. - unsigned int vecStart = hc::__mad24(wg, (unsigned int)(BLOCK_MULTIPLIER * BLOCKSIZE), (unsigned int)(csr_row_ptr[row] - idx_base)); - unsigned int vecEnd = ((csr_row_ptr[row + 1] - idx_base) > vecStart + BLOCK_MULTIPLIER * BLOCKSIZE) - ? vecStart + BLOCK_MULTIPLIER * BLOCKSIZE - : (csr_row_ptr[row + 1] - idx_base); + rocsparse_int vecStart = mad24(wg, BLOCK_MULTIPLIER * BLOCKSIZE, csr_row_ptr[row] - idx_base); + rocsparse_int vecEnd = ((csr_row_ptr[row + 1] - idx_base) > vecStart + BLOCK_MULTIPLIER * BLOCKSIZE) + ? 
vecStart + BLOCK_MULTIPLIER * BLOCKSIZE + : (csr_row_ptr[row + 1] - idx_base); T temp_sum = 0.; @@ -313,15 +330,17 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // threads, 4 rows = 64 threads, 5 rows = 32 threads, etc. // int numThreadsForRed = get_local_size(0) >> ((CHAR_BIT*sizeof(unsigned // int))-clz(num_rows-1)); - unsigned int numThreadsForRed = wg; // Same calculation as above, done on host. + rocsparse_int numThreadsForRed = wg; // Same calculation as above, done on host. // Stream all of this row block's matrix values into local memory. // Perform the matvec in parallel with this work. - unsigned int col = csr_row_ptr[row] + lid - idx_base; + rocsparse_int col = csr_row_ptr[row] + lid - idx_base; if(gid != (gridDim.x - 1)) { - for(int i = 0; i < BLOCKSIZE; i += WG_SIZE) + for(rocsparse_int i = 0; i < BLOCKSIZE; i += WG_SIZE) + { partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i] - idx_base]; + } } else { @@ -332,8 +351,10 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // However, this may change in the future (e.g. with shared virtual memory.) // This causes a minor performance loss because this is the last workgroup // to be launched, and this loop can't be unrolled. - for(int i = 0; col + i < csr_row_ptr[stop_row] - idx_base; i += WG_SIZE) + for(rocsparse_int i = 0; col + i < csr_row_ptr[stop_row] - idx_base; i += WG_SIZE) + { partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i] - idx_base]; + } } __syncthreads(); @@ -350,10 +371,10 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // numThreadsForRed guaranteed to be a power of two, so the clz code below // avoids an integer divide. ~2% perf gain in EXTRA_PRECISION. 
// size_t st = lid/numThreadsForRed; - unsigned int local_row = row + (lid >> (31 - __clz(numThreadsForRed))); - unsigned int local_first_val = csr_row_ptr[local_row] - csr_row_ptr[row]; - unsigned int local_last_val = csr_row_ptr[local_row + 1] - csr_row_ptr[row]; - unsigned int threadInBlock = lid & (numThreadsForRed - 1); + rocsparse_int local_row = row + (lid >> (31 - __clz(numThreadsForRed))); + rocsparse_int local_first_val = csr_row_ptr[local_row] - csr_row_ptr[row]; + rocsparse_int local_last_val = csr_row_ptr[local_row + 1] - csr_row_ptr[row]; + rocsparse_int threadInBlock = lid & (numThreadsForRed - 1); // Not all row blocks are full -- they may have an odd number of rows. As such, // we need to ensure that adjacent-groups only work on real data for this rowBlock. @@ -362,10 +383,12 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // This is dangerous -- will infinite loop if your last value is within // numThreadsForRed of MAX_UINT. Noticable performance gain to avoid a // long induction variable here, though. - for(unsigned int local_cur_val = local_first_val + threadInBlock; + for(rocsparse_int local_cur_val = local_first_val + threadInBlock; local_cur_val < local_last_val; local_cur_val += numThreadsForRed) + { temp_sum += partialSums[local_cur_val]; + } } __syncthreads(); @@ -376,7 +399,7 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // LDS is full up to {workgroup size} entries. // Now we perform a parallel reduction that sums together the answers for each // row in parallel, leaving us an answer in 'temp_sum' for each row. 
- for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) + for(rocsparse_int i = (WG_SIZE >> 1); i > 0; i >>= 1) { __syncthreads(); temp_sum = sum2_reduce(temp_sum, partialSums, lid, numThreadsForRed, i); @@ -400,20 +423,25 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // However, this reduction is also much faster than CSR-Scalar, because local memory // is designed for scatter-gather operations. // We need a while loop because there may be more rows than threads in the WG. - unsigned int local_row = row + lid; + rocsparse_int local_row = row + lid; while(local_row < stop_row) { - int local_first_val = (csr_row_ptr[local_row] - csr_row_ptr[row]); - int local_last_val = csr_row_ptr[local_row + 1] - csr_row_ptr[row]; + rocsparse_int local_first_val = (csr_row_ptr[local_row] - csr_row_ptr[row]); + rocsparse_int local_last_val = csr_row_ptr[local_row + 1] - csr_row_ptr[row]; temp_sum = 0.; - for(int local_cur_val = local_first_val; local_cur_val < local_last_val; + for(rocsparse_int local_cur_val = local_first_val; local_cur_val < local_last_val; local_cur_val++) + { temp_sum += partialSums[local_cur_val]; + } // After you've done the reduction into the temp_sum register, // put that into the output for each row. if(beta != 0.) + { temp_sum += beta * y[local_row]; + } + y[local_row] = temp_sum; local_row += WG_SIZE; } @@ -445,26 +473,29 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // things. for(unsigned long long j = vecStart + lid; j < vecEnd; j += WG_SIZE) { - unsigned int col = csr_col_ind[(unsigned int)j] - idx_base; + rocsparse_int col = csr_col_ind[(unsigned int)j] - idx_base; temp_sum += alpha * csr_val[(unsigned int)j] * x[col]; } partialSums[lid] = temp_sum; // Reduce partial sums - for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) + for(rocsparse_int i = (WG_SIZE >> 1); i > 0; i >>= 1) { __syncthreads(); temp_sum = sum2_reduce(temp_sum, partialSums, lid, WG_SIZE, i); } - if(lid == 0U) + if(lid == 0) { if(beta != 0.) 
+ { temp_sum += beta * y[row]; + } + y[row] = temp_sum; } - row++; + ++row; } } else @@ -482,11 +513,11 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // First, figure out which workgroup you are in the row. Bottom 24 bits. // You can use that to find the global ID for the first workgroup calculating // this long row. - unsigned int first_wg_in_row = gid - (row_blocks[gid] & ((1ULL << WG_BITS) - 1ULL)); - unsigned int compare_value = row_blocks[gid] & (1ULL << WG_BITS); + rocsparse_int first_wg_in_row = gid - (row_blocks[gid] & ((1ULL << WG_BITS) - 1ULL)); + rocsparse_int compare_value = row_blocks[gid] & (1ULL << WG_BITS); // Bit 24 in the first workgroup is the flag that everyone waits on. - if(gid == first_wg_in_row && lid == 0ULL) + if(gid == first_wg_in_row && lid == 0) { // The first workgroup handles the output initialization. T out_val = y[row]; @@ -498,14 +529,14 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // The first workgroup will eventually flip this bit, and you can move forward. __syncthreads(); while( - gid != first_wg_in_row && lid == 0U && + gid != first_wg_in_row && lid == 0 && ((atomicMax(&row_blocks[first_wg_in_row], 0ULL) & (1ULL << WG_BITS)) == compare_value)) ; __syncthreads(); // After you've passed the barrier, update your local flag to make sure that // the next time through, you know what to wait on. - if(gid != first_wg_in_row && lid == 0ULL) + if(gid != first_wg_in_row && lid == 0) row_blocks[gid] ^= (1ULL << WG_BITS); // All but the final workgroup in a long-row collaboration have the same start_row @@ -513,13 +544,13 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // Load in a bunch of partial results into your register space, rather than LDS (no // contention) // Then dump the partially reduced answers into the LDS for inter-work-item reduction. 
- unsigned int col = vecStart + lid; + rocsparse_int col = vecStart + lid; if(row == stop_row) // inner thread, we can hardcode/unroll this loop { // Don't put BLOCK_MULTIPLIER*BLOCKSIZE as the stop point, because // some GPU compilers will *aggressively* unroll this loop. // That increases register pressure and reduces occupancy. - for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) + for(rocsparse_int j = 0; j < vecEnd - col; j += WG_SIZE) { temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j] - idx_base]; #if 2 * WG_SIZE <= BLOCK_MULTIPLIER * BLOCKSIZE @@ -531,20 +562,22 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, } else { - for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) + for(rocsparse_int j = 0; j < vecEnd - col; j += WG_SIZE) + { temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j] - idx_base]; + } } partialSums[lid] = temp_sum; // Reduce partial sums - for(int i = (WG_SIZE >> 1); i > 0; i >>= 1) + for(rocsparse_int i = (WG_SIZE >> 1); i > 0; i >>= 1) { __syncthreads(); temp_sum = sum2_reduce(temp_sum, partialSums, lid, WG_SIZE, i); } - if(lid == 0U) + if(lid == 0) { atomic_add(&y[row], temp_sum); } diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index 01dabcdb..2c048288 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -49,33 +49,37 @@ static unsigned long long numThreadsForReduction(unsigned long long num_rows) static void ComputeRowBlocks(unsigned long long* rowBlocks, size_t& rowBlockSize, - const int* rowDelimiters, - int nRows, + const rocsparse_int* rowDelimiters, + rocsparse_int nRows, bool allocate_row_blocks = true) { unsigned long long* rowBlocksBase; - int total_row_blocks = 1; // Start at one because of rowBlock[0] + + // Start at one because of rowBlock[0] + rocsparse_int total_row_blocks = 1; if(allocate_row_blocks) { rowBlocksBase = rowBlocks; *rowBlocks = 0; - rowBlocks++; + ++rowBlocks; } + 
unsigned long long sum = 0; - unsigned long long i, last_i = 0; + unsigned long long i; + unsigned long long last_i = 0; // Check to ensure nRows can fit in 32 bits - if((unsigned long long)nRows > (unsigned long long)std::pow(2, ROW_BITS)) + if(static_cast(nRows) > static_cast(std::pow(2, ROW_BITS))) { fprintf(stderr, "nrow does not fit in 32 bits\n"); exit(1); } - int consecutive_long_rows = 0; - for(i = 1; i <= (unsigned long long)nRows; i++) + rocsparse_int consecutive_long_rows = 0; + for(i = 1; i <= static_cast(nRows); ++i) { - int row_length = (rowDelimiters[i] - rowDelimiters[i - 1]); + rocsparse_int row_length = (rowDelimiters[i] - rowDelimiters[i - 1]); sum += row_length; // The following section of code calculates whether you're moving between @@ -84,14 +88,20 @@ static void ComputeRowBlocks(unsigned long long* rowBlocks, // roughly the same length. Long rows can be reduced horizontally. // Short rows can be reduced one-thread-per-row. Try not to mix them. if(row_length > 128) - consecutive_long_rows++; + { + ++consecutive_long_rows; + } else if(consecutive_long_rows > 0) { // If it turns out we WERE in a long-row region, cut if off now. if(row_length < 32) // Now we're in a short-row region + { consecutive_long_rows = -1; + } else + { consecutive_long_rows++; + } } // If you just entered into a "long" row from a series of short rows, @@ -105,14 +115,19 @@ static void ComputeRowBlocks(unsigned long long* rowBlocks, if(allocate_row_blocks) { *rowBlocks = ((i - 1) << (64 - ROW_BITS)); + // If this row fits into CSR-Stream, calculate how many rows // can be used to do a parallel reduction. 
// Fill in the low-order bits with the numThreadsForRed - if(((i - 1) - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + if(((i - 1) - last_i) > static_cast(ROWS_FOR_VECTOR)) + { *(rowBlocks - 1) |= numThreadsForReduction((i - 1) - last_i); - rowBlocks++; + } + + ++rowBlocks; } - total_row_blocks++; + + ++total_row_blocks; last_i = i - 1; sum = row_length; } @@ -124,11 +139,15 @@ static void ComputeRowBlocks(unsigned long long* rowBlocks, if(allocate_row_blocks) { *rowBlocks = ((i - 1) << (64 - ROW_BITS)); - if(((i - 1) - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + if(((i - 1) - last_i) > static_cast(ROWS_FOR_VECTOR)) + { *(rowBlocks - 1) |= numThreadsForReduction((i - 1) - last_i); - rowBlocks++; + } + + ++rowBlocks; } - total_row_blocks++; + + ++total_row_blocks; last_i = i - 1; sum = row_length; consecutive_long_rows = 0; @@ -138,26 +157,27 @@ static void ComputeRowBlocks(unsigned long long* rowBlocks, // exactly one row results in non-zero elements to be greater than blockSize // This is csr-vector case; bottom WGBITS == workgroup ID - if((i - last_i == 1) && sum > (unsigned long long)BLOCKSIZE) + if((i - last_i == 1) && sum > static_cast(BLOCKSIZE)) { - int numWGReq = - static_cast(std::ceil((double)row_length / (BLOCK_MULTIPLIER * BLOCKSIZE))); + rocsparse_int numWGReq = static_cast(std::ceil(static_cast(row_length) / (BLOCK_MULTIPLIER * BLOCKSIZE))); // Check to ensure #workgroups can fit in WGBITS bits, if not // then the last workgroup will do all the remaining work - numWGReq = (numWGReq < (int)std::pow(2, WG_BITS)) ? numWGReq : (int)std::pow(2, WG_BITS); + numWGReq = (numWGReq < static_cast(std::pow(2, WG_BITS))) ? 
numWGReq : static_cast(std::pow(2, WG_BITS)); if(allocate_row_blocks) { - for(int w = 1; w < numWGReq; w++) + for(rocsparse_int w = 1; w < numWGReq; ++w) { *rowBlocks = ((i - 1) << (64 - ROW_BITS)); *rowBlocks |= static_cast(w); - rowBlocks++; + ++rowBlocks; } + *rowBlocks = (i << (64 - ROW_BITS)); - rowBlocks++; + ++rowBlocks; } + total_row_blocks += numWGReq; last_i = i; sum = 0; @@ -165,32 +185,42 @@ static void ComputeRowBlocks(unsigned long long* rowBlocks, } // more than one row results in non-zero elements to be greater than blockSize // This is csr-stream case; bottom WGBITS = number of parallel reduction threads - else if((i - last_i > 1) && sum > (unsigned long long)BLOCKSIZE) + else if((i - last_i > 1) && sum > static_cast(BLOCKSIZE)) { - i--; // This row won't fit, so back off one. + // This row won't fit, so back off one. + --i; + if(allocate_row_blocks) { *rowBlocks = (i << (64 - ROW_BITS)); - if((i - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + if((i - last_i) > static_cast(ROWS_FOR_VECTOR)) + { *(rowBlocks - 1) |= numThreadsForReduction(i - last_i); - rowBlocks++; + } + + ++rowBlocks; } - total_row_blocks++; + + ++total_row_blocks; last_i = i; sum = 0; consecutive_long_rows = 0; } // This is csr-stream case; bottom WGBITS = number of parallel reduction threads - else if(sum == (unsigned long long)BLOCKSIZE) + else if(sum == static_cast(BLOCKSIZE)) { if(allocate_row_blocks) { *rowBlocks = (i << (64 - ROW_BITS)); - if((i - last_i) > (unsigned long long)ROWS_FOR_VECTOR) + if((i - last_i) > static_cast(ROWS_FOR_VECTOR)) + { *(rowBlocks - 1) |= numThreadsForReduction(i - last_i); - rowBlocks++; + } + + ++rowBlocks; } - total_row_blocks++; + + ++total_row_blocks; last_i = i; sum = 0; consecutive_long_rows = 0; @@ -201,11 +231,15 @@ static void ComputeRowBlocks(unsigned long long* rowBlocks, if(allocate_row_blocks && (*(rowBlocks - 1) >> (64 - ROW_BITS)) != static_cast(nRows)) { *rowBlocks = (static_cast(nRows) << (64 - ROW_BITS)); - if((nRows - 
last_i) > (unsigned long long)ROWS_FOR_VECTOR) + if((nRows - last_i) > static_cast(ROWS_FOR_VECTOR)) + { *(rowBlocks - 1) |= numThreadsForReduction(i - last_i); - rowBlocks++; + } + + ++rowBlocks; } - total_row_blocks++; + + ++total_row_blocks; if(allocate_row_blocks) { @@ -217,7 +251,9 @@ static void ComputeRowBlocks(unsigned long long* rowBlocks, rowBlockSize = 2 * dist; } else + { rowBlockSize = 2 * total_row_blocks; + } } /* @@ -250,7 +286,7 @@ extern "C" rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } - // Logging TODO bench logging + // Logging log_trace(handle, "rocsparse_csrmv_analysis", trans, @@ -328,11 +364,14 @@ extern "C" rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, m, true); - // Allocate memory on device to hold csrmv info - RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->csrmv_info->row_blocks, sizeof(unsigned long long) * info->csrmv_info->size)); + // Allocate memory on device to hold csrmv info, if required + if(info->csrmv_info->size > 0) + { + RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->csrmv_info->row_blocks, sizeof(unsigned long long) * info->csrmv_info->size)); - // Copy row blocks information to device - RETURN_IF_HIP_ERROR(hipMemcpy(info->csrmv_info->row_blocks, row_blocks.data(), sizeof(unsigned long long) * info->csrmv_info->size, hipMemcpyHostToDevice)); + // Copy row blocks information to device + RETURN_IF_HIP_ERROR(hipMemcpy(info->csrmv_info->row_blocks, row_blocks.data(), sizeof(unsigned long long) * info->csrmv_info->size, hipMemcpyHostToDevice)); + } // Store some pointers to verify correct execution info->csrmv_info->trans = trans; @@ -344,7 +383,7 @@ extern "C" rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, info->csrmv_info->csr_col_ind = csr_col_ind; // Set built flag - info->csrmv_info->built = true; + info->csrmv_built = true; return rocsparse_status_success; } @@ -362,12 +401,16 @@ extern "C" rocsparse_status 
rocsparse_csrmv_analysis_clear(rocsparse_handle hand return rocsparse_status_invalid_pointer; } - // Logging TODO bench logging + // Logging log_trace(handle, "rocsparse_csrmv_analysis_clear", (const void*&)info); - return rocsparse_destroy_csrmv_info(info->csrmv_info); + // Destroy csrmv info struct + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrmv_info(info->csrmv_info)); + info->csrmv_built = false; + + return rocsparse_status_success; } extern "C" rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, From 618a708e32a54d980b8b5d0f43dde5b15a713d92 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 11:47:26 +0200 Subject: [PATCH 219/304] clang-format --- .../rocsparse_template_specialization.cpp | 32 ++- clients/include/testing_coomv.hpp | 14 +- clients/include/testing_csrmv.hpp | 232 ++++++++++++++---- clients/include/testing_ellmv.hpp | 14 +- clients/include/testing_hybmv.hpp | 8 +- clients/include/utility.hpp | 2 +- clients/samples/example_coomv.cpp | 10 +- clients/samples/example_csrmv.cpp | 10 +- clients/samples/example_ellmv.cpp | 10 +- clients/samples/example_hybmv.cpp | 10 +- library/include/rocsparse-functions.h | 4 +- library/src/conversion/rocsparse_coo2csr.cpp | 4 +- library/src/conversion/rocsparse_coosort.cpp | 4 +- library/src/conversion/rocsparse_csr2coo.cpp | 4 +- library/src/conversion/rocsparse_csr2csc.hpp | 5 +- library/src/conversion/rocsparse_csr2ell.hpp | 5 +- library/src/conversion/rocsparse_csr2hyb.hpp | 5 +- library/src/conversion/rocsparse_csrsort.cpp | 4 +- library/src/conversion/rocsparse_ell2csr.hpp | 5 +- library/src/conversion/rocsparse_identity.cpp | 5 +- library/src/include/handle.h | 2 - library/src/level1/rocsparse_doti.hpp | 5 +- library/src/level1/rocsparse_gthr.hpp | 5 +- library/src/level1/rocsparse_gthrz.hpp | 5 +- library/src/level1/rocsparse_sctr.hpp | 5 +- library/src/level2/csrmv_device.h | 39 +-- library/src/level2/rocsparse_csrmv.cpp | 79 ++++-- library/src/level2/rocsparse_csrmv.hpp | 77 ++++-- 
28 files changed, 388 insertions(+), 216 deletions(-) diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index a8c7320c..ef9e2d39 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -202,8 +202,20 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, float* y, const rocsparse_mat_info info) { - return rocsparse_scsrmv( - handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); + return rocsparse_scsrmv(handle, + trans, + m, + n, + nnz, + alpha, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + x, + beta, + y, + info); } template <> @@ -222,8 +234,20 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, double* y, const rocsparse_mat_info info) { - return rocsparse_dcsrmv( - handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); + return rocsparse_dcsrmv(handle, + trans, + m, + n, + nnz, + alpha, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + x, + beta, + y, + info); } template <> diff --git a/clients/include/testing_coomv.hpp b/clients/include/testing_coomv.hpp index 5fc2e1a2..dd8ea2fd 100644 --- a/clients/include/testing_coomv.hpp +++ b/clients/include/testing_coomv.hpp @@ -20,12 +20,12 @@ using namespace rocsparse_test; template void testing_coomv_bad_arg(void) { - rocsparse_int n = 100; - rocsparse_int m = 100; - rocsparse_int nnz = 100; - rocsparse_int safe_size = 100; - T alpha = 0.6; - T beta = 0.2; + rocsparse_int n = 100; + rocsparse_int m = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + T alpha = 0.6; + T beta = 0.2; rocsparse_operation transA = rocsparse_operation_none; rocsparse_status status; @@ -137,7 +137,7 @@ rocsparse_status testing_coomv(Arguments argus) rocsparse_int n = argus.N; T h_alpha = argus.alpha; T h_beta = argus.beta; - rocsparse_operation transA = argus.transA; + 
rocsparse_operation transA = argus.transA; rocsparse_index_base idx_base = argus.idx_base; std::string binfile = ""; std::string filename = ""; diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index c9bdc75f..49e7a64d 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -21,12 +21,12 @@ using namespace rocsparse_test; template void testing_csrmv_bad_arg(void) { - rocsparse_int n = 100; - rocsparse_int m = 100; - rocsparse_int nnz = 100; - rocsparse_int safe_size = 100; - T alpha = 0.6; - T beta = 0.2; + rocsparse_int n = 100; + rocsparse_int m = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + T alpha = 0.6; + T beta = 0.2; rocsparse_operation transA = rocsparse_operation_none; rocsparse_status status; @@ -65,40 +65,35 @@ void testing_csrmv_bad_arg(void) { rocsparse_int* dptr_null = nullptr; - status = rocsparse_csrmv_analysis( - handle, transA, m, n, nnz, descr, dptr_null, dcol, info); + status = rocsparse_csrmv_analysis(handle, transA, m, n, nnz, descr, dptr_null, dcol, info); verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); } // testing for(nullptr == dcol) { rocsparse_int* dcol_null = nullptr; - status = rocsparse_csrmv_analysis( - handle, transA, m, n, nnz, descr, dptr, dcol_null, info); + status = rocsparse_csrmv_analysis(handle, transA, m, n, nnz, descr, dptr, dcol_null, info); verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); } // testing for(nullptr == descr) { rocsparse_mat_descr descr_null = nullptr; - status = rocsparse_csrmv_analysis( - handle, transA, m, n, nnz, descr_null, dptr, dcol, info); + status = rocsparse_csrmv_analysis(handle, transA, m, n, nnz, descr_null, dptr, dcol, info); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } // testing for(nullptr == info) { rocsparse_mat_info info_null = nullptr; - status = rocsparse_csrmv_analysis( - handle, transA, m, n, nnz, descr, dptr, 
dcol, info_null); + status = rocsparse_csrmv_analysis(handle, transA, m, n, nnz, descr, dptr, dcol, info_null); verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); } // testing for(nullptr == handle) { rocsparse_handle handle_null = nullptr; - status = rocsparse_csrmv_analysis( - handle_null, transA, m, n, nnz, descr, dptr, dcol, info); + status = rocsparse_csrmv_analysis(handle_null, transA, m, n, nnz, descr, dptr, dcol, info); verify_rocsparse_status_invalid_handle(status); } @@ -108,72 +103,180 @@ void testing_csrmv_bad_arg(void) { rocsparse_int* dptr_null = nullptr; - status = rocsparse_csrmv( - handle, transA, m, n, nnz, &alpha, descr, dval, dptr_null, dcol, dx, &beta, dy, nullptr); + status = rocsparse_csrmv(handle, + transA, + m, + n, + nnz, + &alpha, + descr, + dval, + dptr_null, + dcol, + dx, + &beta, + dy, + nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); } // testing for(nullptr == dcol) { rocsparse_int* dcol_null = nullptr; - status = rocsparse_csrmv( - handle, transA, m, n, nnz, &alpha, descr, dval, dptr, dcol_null, dx, &beta, dy, nullptr); + status = rocsparse_csrmv(handle, + transA, + m, + n, + nnz, + &alpha, + descr, + dval, + dptr, + dcol_null, + dx, + &beta, + dy, + nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); } // testing for(nullptr == dval) { T* dval_null = nullptr; - status = rocsparse_csrmv( - handle, transA, m, n, nnz, &alpha, descr, dval_null, dptr, dcol, dx, &beta, dy, nullptr); + status = rocsparse_csrmv(handle, + transA, + m, + n, + nnz, + &alpha, + descr, + dval_null, + dptr, + dcol, + dx, + &beta, + dy, + nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); } // testing for(nullptr == dx) { T* dx_null = nullptr; - status = rocsparse_csrmv( - handle, transA, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx_null, &beta, dy, nullptr); + status = rocsparse_csrmv(handle, + transA, + m, + n, + nnz, + &alpha, + 
descr, + dval, + dptr, + dcol, + dx_null, + &beta, + dy, + nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); } // testing for(nullptr == dy) { T* dy_null = nullptr; - status = rocsparse_csrmv( - handle, transA, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy_null, nullptr); + status = rocsparse_csrmv(handle, + transA, + m, + n, + nnz, + &alpha, + descr, + dval, + dptr, + dcol, + dx, + &beta, + dy_null, + nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: dy is nullptr"); } // testing for(nullptr == d_alpha) { T* d_alpha_null = nullptr; - status = rocsparse_csrmv( - handle, transA, m, n, nnz, d_alpha_null, descr, dval, dptr, dcol, dx, &beta, dy, nullptr); + status = rocsparse_csrmv(handle, + transA, + m, + n, + nnz, + d_alpha_null, + descr, + dval, + dptr, + dcol, + dx, + &beta, + dy, + nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } // testing for(nullptr == d_beta) { T* d_beta_null = nullptr; - status = rocsparse_csrmv( - handle, transA, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, d_beta_null, dy, nullptr); + status = rocsparse_csrmv(handle, + transA, + m, + n, + nnz, + &alpha, + descr, + dval, + dptr, + dcol, + dx, + d_beta_null, + dy, + nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); } // testing for(nullptr == descr) { rocsparse_mat_descr descr_null = nullptr; - status = rocsparse_csrmv( - handle, transA, m, n, nnz, &alpha, descr_null, dval, dptr, dcol, dx, &beta, dy, nullptr); + status = rocsparse_csrmv(handle, + transA, + m, + n, + nnz, + &alpha, + descr_null, + dval, + dptr, + dcol, + dx, + &beta, + dy, + nullptr); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } // testing for(nullptr == handle) { rocsparse_handle handle_null = nullptr; - status = rocsparse_csrmv( - handle_null, transA, m, n, nnz, &alpha, descr, dval, dptr, dcol, dx, &beta, dy, nullptr); + status = 
rocsparse_csrmv(handle_null, + transA, + m, + n, + nnz, + &alpha, + descr, + dval, + dptr, + dcol, + dx, + &beta, + dy, + nullptr); verify_rocsparse_status_invalid_handle(status); } @@ -199,7 +302,7 @@ template static T two_sum(T x, T y, T* sumk_err) { T sumk_s = x + y; - T bp = sumk_s - x; + T bp = sumk_s - x; (*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); return sumk_s; } @@ -212,7 +315,7 @@ rocsparse_status testing_csrmv(Arguments argus) rocsparse_int n = argus.N; T h_alpha = argus.alpha; T h_beta = argus.beta; - rocsparse_operation transA = argus.transA; + rocsparse_operation transA = argus.transA; rocsparse_index_base idx_base = argus.idx_base; bool adaptive = argus.bswitch; std::string binfile = ""; @@ -426,7 +529,8 @@ rocsparse_status testing_csrmv(Arguments argus) if(adaptive) { // csrmv analysis - CHECK_ROCSPARSE_ERROR(rocsparse_csrmv_analysis(handle, transA, m, n, nnz, descr, dptr, dcol, info)); + CHECK_ROCSPARSE_ERROR( + rocsparse_csrmv_analysis(handle, transA, m, n, nnz, descr, dptr, dcol, info)); } if(argus.unit_check) @@ -459,7 +563,9 @@ rocsparse_status testing_csrmv(Arguments argus) T sum = hy_gold[i]; T err = static_cast(0); - for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; ++j) + for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; + j < hcsr_row_ptr[i + 1] - idx_base; + ++j) { sum = two_sum(sum, h_alpha * hval[j] * hx[hcol_ind[j] - idx_base], &err); } @@ -472,10 +578,10 @@ rocsparse_status testing_csrmv(Arguments argus) // Query for warpSize hipDeviceProp_t prop; hipGetDeviceProperties(&prop, 0); - + rocsparse_int WF_SIZE; rocsparse_int nnz_per_row = nnz / m; - + if(prop.warpSize == 32) { if(nnz_per_row < 4) @@ -508,23 +614,25 @@ rocsparse_status testing_csrmv(Arguments argus) { return rocsparse_status_internal_error; } - + for(rocsparse_int i = 0; i < m; ++i) { std::vector sum(WF_SIZE, 0.0); - - for(rocsparse_int j = hcsr_row_ptr[i] - idx_base; j < hcsr_row_ptr[i + 1] - idx_base; + + for(rocsparse_int j = 
hcsr_row_ptr[i] - idx_base; + j < hcsr_row_ptr[i + 1] - idx_base; j += WF_SIZE) { for(rocsparse_int k = 0; k < WF_SIZE; ++k) { if(j + k < hcsr_row_ptr[i + 1] - idx_base) { - sum[k] = fma(h_alpha * hval[j + k], hx[hcol_ind[j + k] - idx_base], sum[k]); + sum[k] = + fma(h_alpha * hval[j + k], hx[hcol_ind[j + k] - idx_base], sum[k]); } } } - + for(rocsparse_int j = 1; j < WF_SIZE; j <<= 1) { for(rocsparse_int k = 0; k < WF_SIZE - j; ++k) @@ -532,7 +640,7 @@ rocsparse_status testing_csrmv(Arguments argus) sum[k] += sum[k + j]; } } - + if(h_beta == 0.0) { hy_gold[i] = sum[0]; @@ -566,16 +674,40 @@ rocsparse_status testing_csrmv(Arguments argus) for(int iter = 0; iter < number_cold_calls; iter++) { - rocsparse_csrmv( - handle, transA, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1, info); + rocsparse_csrmv(handle, + transA, + m, + n, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol, + dx, + &h_beta, + dy_1, + info); } double gpu_time_used = get_time_us(); // in microseconds for(int iter = 0; iter < number_hot_calls; iter++) { - rocsparse_csrmv( - handle, transA, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1, info); + rocsparse_csrmv(handle, + transA, + m, + n, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol, + dx, + &h_beta, + dy_1, + info); } // Convert to miliseconds per call diff --git a/clients/include/testing_ellmv.hpp b/clients/include/testing_ellmv.hpp index 81b50167..59f7ab5e 100644 --- a/clients/include/testing_ellmv.hpp +++ b/clients/include/testing_ellmv.hpp @@ -24,12 +24,12 @@ using namespace rocsparse_test; template void testing_ellmv_bad_arg(void) { - rocsparse_int n = 100; - rocsparse_int m = 100; - rocsparse_int safe_size = 100; - rocsparse_int ell_width = 8; - T alpha = 0.6; - T beta = 0.2; + rocsparse_int n = 100; + rocsparse_int m = 100; + rocsparse_int safe_size = 100; + rocsparse_int ell_width = 8; + T alpha = 0.6; + T beta = 0.2; rocsparse_operation transA = rocsparse_operation_none; rocsparse_status status; @@ 
-130,7 +130,7 @@ rocsparse_status testing_ellmv(Arguments argus) rocsparse_int n = argus.N; T h_alpha = argus.alpha; T h_beta = argus.beta; - rocsparse_operation transA = argus.transA; + rocsparse_operation transA = argus.transA; rocsparse_index_base idx_base = argus.idx_base; std::string binfile = ""; std::string filename = ""; diff --git a/clients/include/testing_hybmv.hpp b/clients/include/testing_hybmv.hpp index 8979c812..3e0fec6e 100644 --- a/clients/include/testing_hybmv.hpp +++ b/clients/include/testing_hybmv.hpp @@ -39,9 +39,9 @@ struct testhyb template void testing_hybmv_bad_arg(void) { - rocsparse_int safe_size = 100; - T alpha = 0.6; - T beta = 0.2; + rocsparse_int safe_size = 100; + T alpha = 0.6; + T beta = 0.2; rocsparse_operation transA = rocsparse_operation_none; rocsparse_status status; @@ -125,7 +125,7 @@ rocsparse_status testing_hybmv(Arguments argus) rocsparse_int n = argus.N; T h_alpha = argus.alpha; T h_beta = argus.beta; - rocsparse_operation transA = argus.transA; + rocsparse_operation transA = argus.transA; rocsparse_index_base idx_base = argus.idx_base; rocsparse_hyb_partition part = argus.part; rocsparse_int user_ell_width = argus.ell_width; diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index c330e913..f3af6315 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -643,7 +643,7 @@ class Arguments rocsparse_int temp = 0; std::string filename = ""; - bool bswitch = false; + bool bswitch = false; Arguments& operator=(const Arguments& rhs) { diff --git a/clients/samples/example_coomv.cpp b/clients/samples/example_coomv.cpp index 9f8f6284..19ad76a4 100644 --- a/clients/samples/example_coomv.cpp +++ b/clients/samples/example_coomv.cpp @@ -75,11 +75,11 @@ int main(int argc, char* argv[]) rocsparse_create_mat_descr(&descrA); // Offload data to device - rocsparse_int* dArow = NULL; - rocsparse_int* dAcol = NULL; - double* dAval = NULL; - double* dx = NULL; - double* dy = NULL; + rocsparse_int* dArow 
= NULL; + rocsparse_int* dAcol = NULL; + double* dAval = NULL; + double* dx = NULL; + double* dy = NULL; hipMalloc((void**)&dArow, sizeof(rocsparse_int) * nnz); hipMalloc((void**)&dAcol, sizeof(rocsparse_int) * nnz); diff --git a/clients/samples/example_csrmv.cpp b/clients/samples/example_csrmv.cpp index d58a916f..d8e9bbdb 100644 --- a/clients/samples/example_csrmv.cpp +++ b/clients/samples/example_csrmv.cpp @@ -64,11 +64,11 @@ int main(int argc, char* argv[]) rocsparse_create_mat_descr(&descrA); // Offload data to device - rocsparse_int* dAptr = NULL; - rocsparse_int* dAcol = NULL; - double* dAval = NULL; - double* dx = NULL; - double* dy = NULL; + rocsparse_int* dAptr = NULL; + rocsparse_int* dAcol = NULL; + double* dAval = NULL; + double* dx = NULL; + double* dy = NULL; hipMalloc((void**)&dAptr, sizeof(rocsparse_int) * (m + 1)); hipMalloc((void**)&dAcol, sizeof(rocsparse_int) * nnz); diff --git a/clients/samples/example_ellmv.cpp b/clients/samples/example_ellmv.cpp index 8d49d837..c1a00108 100644 --- a/clients/samples/example_ellmv.cpp +++ b/clients/samples/example_ellmv.cpp @@ -64,11 +64,11 @@ int main(int argc, char* argv[]) rocsparse_create_mat_descr(&descrA); // Offload data to device - rocsparse_int* dAptr = NULL; - rocsparse_int* dAcol = NULL; - double* dAval = NULL; - double* dx = NULL; - double* dy = NULL; + rocsparse_int* dAptr = NULL; + rocsparse_int* dAcol = NULL; + double* dAval = NULL; + double* dx = NULL; + double* dy = NULL; hipMalloc((void**)&dAptr, sizeof(rocsparse_int) * (m + 1)); hipMalloc((void**)&dAcol, sizeof(rocsparse_int) * nnz); diff --git a/clients/samples/example_hybmv.cpp b/clients/samples/example_hybmv.cpp index 27283d07..aff2deba 100644 --- a/clients/samples/example_hybmv.cpp +++ b/clients/samples/example_hybmv.cpp @@ -64,11 +64,11 @@ int main(int argc, char* argv[]) rocsparse_create_mat_descr(&descrA); // Offload data to device - rocsparse_int* dAptr = NULL; - rocsparse_int* dAcol = NULL; - double* dAval = NULL; - double* dx = 
NULL; - double* dy = NULL; + rocsparse_int* dAptr = NULL; + rocsparse_int* dAcol = NULL; + double* dAval = NULL; + double* dx = NULL; + double* dy = NULL; hipMalloc((void**)&dAptr, sizeof(rocsparse_int) * (m + 1)); hipMalloc((void**)&dAcol, sizeof(rocsparse_int) * nnz); diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 45267a12..1c2dbdb2 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -569,7 +569,6 @@ rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, const rocsparse_int* csr_col_ind, rocsparse_mat_info info); - /*! \brief SPARSE Level 2 API \details @@ -588,8 +587,7 @@ rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, ********************************************************************/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_csrmv_analysis_clear(rocsparse_handle handle, - rocsparse_mat_info info); +rocsparse_status rocsparse_csrmv_analysis_clear(rocsparse_handle handle, rocsparse_mat_info info); /*! 
\brief SPARSE Level 2 API diff --git a/library/src/conversion/rocsparse_coo2csr.cpp b/library/src/conversion/rocsparse_coo2csr.cpp index e934aaad..1eb902d4 100644 --- a/library/src/conversion/rocsparse_coo2csr.cpp +++ b/library/src/conversion/rocsparse_coo2csr.cpp @@ -31,9 +31,7 @@ extern "C" rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, (const void*&)csr_row_ptr, idx_base); - log_bench(handle, - "./rocsparse-bench -f coo2csr", - "--mtx "); + log_bench(handle, "./rocsparse-bench -f coo2csr", "--mtx "); // Check sizes if(nnz < 0) diff --git a/library/src/conversion/rocsparse_coosort.cpp b/library/src/conversion/rocsparse_coosort.cpp index b72ab1f0..94c5d4fa 100644 --- a/library/src/conversion/rocsparse_coosort.cpp +++ b/library/src/conversion/rocsparse_coosort.cpp @@ -146,9 +146,7 @@ extern "C" rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, (const void*&)perm, (const void*&)temp_buffer); - log_bench(handle, - "./rocsparse-bench -f coosort", - "--mtx "); + log_bench(handle, "./rocsparse-bench -f coosort", "--mtx "); // Check sizes if(m < 0) diff --git a/library/src/conversion/rocsparse_csr2coo.cpp b/library/src/conversion/rocsparse_csr2coo.cpp index b3332066..a7de6980 100644 --- a/library/src/conversion/rocsparse_csr2coo.cpp +++ b/library/src/conversion/rocsparse_csr2coo.cpp @@ -31,9 +31,7 @@ extern "C" rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, (const void*&)coo_row_ind, idx_base); - log_bench(handle, - "./rocsparse-bench -f csr2coo ", - "--mtx "); + log_bench(handle, "./rocsparse-bench -f csr2coo ", "--mtx "); // Check sizes if(nnz < 0) diff --git a/library/src/conversion/rocsparse_csr2csc.hpp b/library/src/conversion/rocsparse_csr2csc.hpp index 3dd05d57..e34187df 100644 --- a/library/src/conversion/rocsparse_csr2csc.hpp +++ b/library/src/conversion/rocsparse_csr2csc.hpp @@ -52,10 +52,7 @@ rocsparse_status rocsparse_csr2csc_template(rocsparse_handle handle, idx_base, (const void*&)temp_buffer); - log_bench(handle, 
- "./rocsparse-bench -f csr2csc -r", - replaceX("X"), - "--mtx "); + log_bench(handle, "./rocsparse-bench -f csr2csc -r", replaceX("X"), "--mtx "); // Check index base if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) diff --git a/library/src/conversion/rocsparse_csr2ell.hpp b/library/src/conversion/rocsparse_csr2ell.hpp index aeee6696..3e19d308 100644 --- a/library/src/conversion/rocsparse_csr2ell.hpp +++ b/library/src/conversion/rocsparse_csr2ell.hpp @@ -53,10 +53,7 @@ rocsparse_status rocsparse_csr2ell_template(rocsparse_handle handle, (const void*&)ell_val, (const void*&)ell_col_ind); - log_bench(handle, - "./rocsparse-bench -f csr2ell -r", - replaceX("X"), - "--mtx "); + log_bench(handle, "./rocsparse-bench -f csr2ell -r", replaceX("X"), "--mtx "); // Check index base if(csr_descr->base != rocsparse_index_base_zero && csr_descr->base != rocsparse_index_base_one) diff --git a/library/src/conversion/rocsparse_csr2hyb.hpp b/library/src/conversion/rocsparse_csr2hyb.hpp index 334a0b16..83fd91b3 100644 --- a/library/src/conversion/rocsparse_csr2hyb.hpp +++ b/library/src/conversion/rocsparse_csr2hyb.hpp @@ -55,10 +55,7 @@ rocsparse_status rocsparse_csr2hyb_template(rocsparse_handle handle, user_ell_width, partition_type); - log_bench(handle, - "./rocsparse-bench -f csr2hyb -r", - replaceX("X"), - "--mtx "); + log_bench(handle, "./rocsparse-bench -f csr2hyb -r", replaceX("X"), "--mtx "); // Check index base if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index 9a573f37..5e8642bb 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -137,9 +137,7 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, (const void*&)perm, (const void*&)temp_buffer); - log_bench(handle, - "./rocsparse-bench -f csrsort", - "--mtx "); + 
log_bench(handle, "./rocsparse-bench -f csrsort", "--mtx "); // Check sizes if(m < 0) diff --git a/library/src/conversion/rocsparse_ell2csr.hpp b/library/src/conversion/rocsparse_ell2csr.hpp index ed2ba156..b5ba34ec 100644 --- a/library/src/conversion/rocsparse_ell2csr.hpp +++ b/library/src/conversion/rocsparse_ell2csr.hpp @@ -55,10 +55,7 @@ rocsparse_status rocsparse_ell2csr_template(rocsparse_handle handle, (const void*&)csr_row_ptr, (const void*&)csr_col_ind); - log_bench(handle, - "./rocsparse-bench -f ell2csr -r", - replaceX("X"), - "--mtx "); + log_bench(handle, "./rocsparse-bench -f ell2csr -r", replaceX("X"), "--mtx "); // Check index base if(ell_descr->base != rocsparse_index_base_zero && ell_descr->base != rocsparse_index_base_one) diff --git a/library/src/conversion/rocsparse_identity.cpp b/library/src/conversion/rocsparse_identity.cpp index 16f442cd..bba49c98 100644 --- a/library/src/conversion/rocsparse_identity.cpp +++ b/library/src/conversion/rocsparse_identity.cpp @@ -21,10 +21,7 @@ rocsparse_create_identity_permutation(rocsparse_handle handle, rocsparse_int n, // Logging log_trace(handle, "rocsparse_create_identity_permutation", n, (const void*&)p); - log_bench(handle, - "./rocsparse-bench -f identity", - "-n", - n); + log_bench(handle, "./rocsparse-bench -f identity", "-n", n); // Check sizes if(n < 0) diff --git a/library/src/include/handle.h b/library/src/include/handle.h index 8d2bf6ab..d43afcf1 100644 --- a/library/src/include/handle.h +++ b/library/src/include/handle.h @@ -119,8 +119,6 @@ struct _rocsparse_mat_info rocsparse_csrmv_info csrmv_info = nullptr; }; - - /******************************************************************************** * \brief rocsparse_csrmv_info is a structure holding the rocsparse csrmv info * data gathered during csrmv_analysis. 
It must be initialized using the diff --git a/library/src/level1/rocsparse_doti.hpp b/library/src/level1/rocsparse_doti.hpp index 1c5af9c6..a96ac195 100644 --- a/library/src/level1/rocsparse_doti.hpp +++ b/library/src/level1/rocsparse_doti.hpp @@ -41,10 +41,7 @@ rocsparse_status rocsparse_doti_template(rocsparse_handle handle, *result, idx_base); - log_bench(handle, - "./rocsparse-bench -f doti -r", - replaceX("X"), - "--mtx "); + log_bench(handle, "./rocsparse-bench -f doti -r", replaceX("X"), "--mtx "); } else { diff --git a/library/src/level1/rocsparse_gthr.hpp b/library/src/level1/rocsparse_gthr.hpp index 70d5d067..3a87c7b3 100644 --- a/library/src/level1/rocsparse_gthr.hpp +++ b/library/src/level1/rocsparse_gthr.hpp @@ -36,10 +36,7 @@ rocsparse_status rocsparse_gthr_template(rocsparse_handle handle, (const void*&)x_ind, idx_base); - log_bench(handle, - "./rocsparse-bench -f gthr -r", - replaceX("X"), - "--mtx "); + log_bench(handle, "./rocsparse-bench -f gthr -r", replaceX("X"), "--mtx "); // Check index base if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) diff --git a/library/src/level1/rocsparse_gthrz.hpp b/library/src/level1/rocsparse_gthrz.hpp index be9c9fae..3898db08 100644 --- a/library/src/level1/rocsparse_gthrz.hpp +++ b/library/src/level1/rocsparse_gthrz.hpp @@ -36,10 +36,7 @@ rocsparse_status rocsparse_gthrz_template(rocsparse_handle handle, (const void*&)x_ind, idx_base); - log_bench(handle, - "./rocsparse-bench -f gthrz -r", - replaceX("X"), - "--mtx "); + log_bench(handle, "./rocsparse-bench -f gthrz -r", replaceX("X"), "--mtx "); // Check index base if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) diff --git a/library/src/level1/rocsparse_sctr.hpp b/library/src/level1/rocsparse_sctr.hpp index 67716fed..07fad1be 100644 --- a/library/src/level1/rocsparse_sctr.hpp +++ b/library/src/level1/rocsparse_sctr.hpp @@ -36,10 +36,7 @@ rocsparse_status rocsparse_sctr_template(rocsparse_handle 
handle, (const void*&)y, idx_base); - log_bench(handle, - "./rocsparse-bench -f sctr -r", - replaceX("X"), - "--mtx "); + log_bench(handle, "./rocsparse-bench -f sctr -r", replaceX("X"), "--mtx "); // Check index base if(idx_base != rocsparse_index_base_zero && idx_base != rocsparse_index_base_one) diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index f61642f0..81df7a95 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -185,7 +185,7 @@ static __device__ void csrmvn_general_device(rocsparse_int m, } } -__device__ static __inline__ void atomic_add(float *address, float val) +__device__ static __inline__ void atomic_add(float* address, float val) { unsigned int newVal; unsigned int prevVal; @@ -194,11 +194,10 @@ __device__ static __inline__ void atomic_add(float *address, float val) { prevVal = __float_as_uint(*address); newVal = __float_as_uint(val + *address); - } - while(atomicCAS((unsigned int*)address, prevVal, newVal) != prevVal); + } while(atomicCAS((unsigned int*)address, prevVal, newVal) != prevVal); } -__device__ static __inline__ void atomic_add(double *address, double val) +__device__ static __inline__ void atomic_add(double* address, double val) { unsigned long long newVal; unsigned long long prevVal; @@ -207,8 +206,7 @@ __device__ static __inline__ void atomic_add(double *address, double val) { prevVal = __double_as_longlong(*address); newVal = __double_as_longlong(val + *address); - } - while(atomicCAS((unsigned long long*)address, prevVal, newVal) != prevVal); + } while(atomicCAS((unsigned long long*)address, prevVal, newVal) != prevVal); } // rocsparse_int == int32_t @@ -229,8 +227,8 @@ __device__ static __inline__ rocsparse_int mad24(rocsparse_int x, rocsparse_int } template -static inline __device__ T -sum2_reduce(T cur_sum, T* partial, rocsparse_int lid, rocsparse_int max_size, rocsparse_int reduc_size) +static inline __device__ T sum2_reduce( + T cur_sum, T* partial, 
rocsparse_int lid, rocsparse_int max_size, rocsparse_int reduc_size) { if(max_size > reduc_size) { @@ -294,11 +292,12 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. // If there are more items in this row, we assign more workgroups. rocsparse_int vecStart = mad24(wg, BLOCK_MULTIPLIER * BLOCKSIZE, csr_row_ptr[row] - idx_base); - rocsparse_int vecEnd = ((csr_row_ptr[row + 1] - idx_base) > vecStart + BLOCK_MULTIPLIER * BLOCKSIZE) - ? vecStart + BLOCK_MULTIPLIER * BLOCKSIZE - : (csr_row_ptr[row + 1] - idx_base); + rocsparse_int vecEnd = + ((csr_row_ptr[row + 1] - idx_base) > vecStart + BLOCK_MULTIPLIER * BLOCKSIZE) + ? vecStart + BLOCK_MULTIPLIER * BLOCKSIZE + : (csr_row_ptr[row + 1] - idx_base); - T temp_sum = 0.; + T temp_sum = 0.; // If the next row block starts more than 2 rows away, then we choose CSR-Stream. // If this is zero (long rows) or one (final workgroup in a long row, or a single @@ -339,7 +338,8 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, { for(rocsparse_int i = 0; i < BLOCKSIZE; i += WG_SIZE) { - partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i] - idx_base]; + partialSums[lid + i] = + alpha * csr_val[col + i] * x[csr_col_ind[col + i] - idx_base]; } } else @@ -353,7 +353,8 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // to be launched, and this loop can't be unrolled. 
for(rocsparse_int i = 0; col + i < csr_row_ptr[stop_row] - idx_base; i += WG_SIZE) { - partialSums[lid + i] = alpha * csr_val[col + i] * x[csr_col_ind[col + i] - idx_base]; + partialSums[lid + i] = + alpha * csr_val[col + i] * x[csr_col_ind[col + i] - idx_base]; } } __syncthreads(); @@ -428,7 +429,7 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, { rocsparse_int local_first_val = (csr_row_ptr[local_row] - csr_row_ptr[row]); rocsparse_int local_last_val = csr_row_ptr[local_row + 1] - csr_row_ptr[row]; - temp_sum = 0.; + temp_sum = 0.; for(rocsparse_int local_cur_val = local_first_val; local_cur_val < local_last_val; local_cur_val++) { @@ -462,9 +463,9 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, { // Any workgroup only calculates, at most, BLOCKSIZE items in this row. // If there are more items in this row, we use CSR-LongRows. - temp_sum = 0.; - vecStart = csr_row_ptr[row] - idx_base; - vecEnd = csr_row_ptr[row + 1] - idx_base; + temp_sum = 0.; + vecStart = csr_row_ptr[row] - idx_base; + vecEnd = csr_row_ptr[row + 1] - idx_base; // Load in a bunch of partial results into your register space, rather than LDS (no // contention) @@ -521,7 +522,7 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, { // The first workgroup handles the output initialization. T out_val = y[row]; - temp_sum = (beta - 1.) * out_val; + temp_sum = (beta - 1.) * out_val; atomicXor(&row_blocks[first_wg_in_row], (1ULL << WG_BITS)); // Release other workgroups. } // For every other workgroup, bit 24 holds the value they wait on. 
diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index 2c048288..b9dac65f 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -6,8 +6,7 @@ #include "rocsparse.h" #include "rocsparse_csrmv.hpp" -__attribute__((unused)) -static unsigned int flp2(unsigned int x) +__attribute__((unused)) static unsigned int flp2(unsigned int x) { x |= (x >> 1); x |= (x >> 2); @@ -70,7 +69,8 @@ static void ComputeRowBlocks(unsigned long long* rowBlocks, unsigned long long last_i = 0; // Check to ensure nRows can fit in 32 bits - if(static_cast(nRows) > static_cast(std::pow(2, ROW_BITS))) + if(static_cast(nRows) > + static_cast(std::pow(2, ROW_BITS))) { fprintf(stderr, "nrow does not fit in 32 bits\n"); exit(1); @@ -159,11 +159,14 @@ static void ComputeRowBlocks(unsigned long long* rowBlocks, // This is csr-vector case; bottom WGBITS == workgroup ID if((i - last_i == 1) && sum > static_cast(BLOCKSIZE)) { - rocsparse_int numWGReq = static_cast(std::ceil(static_cast(row_length) / (BLOCK_MULTIPLIER * BLOCKSIZE))); + rocsparse_int numWGReq = static_cast( + std::ceil(static_cast(row_length) / (BLOCK_MULTIPLIER * BLOCKSIZE))); // Check to ensure #workgroups can fit in WGBITS bits, if not // then the last workgroup will do all the remaining work - numWGReq = (numWGReq < static_cast(std::pow(2, WG_BITS))) ? numWGReq : static_cast(std::pow(2, WG_BITS)); + numWGReq = (numWGReq < static_cast(std::pow(2, WG_BITS))) + ? numWGReq + : static_cast(std::pow(2, WG_BITS)); if(allocate_row_blocks) { @@ -228,7 +231,8 @@ static void ComputeRowBlocks(unsigned long long* rowBlocks, } // If we didn't fill a row block with the last row, make sure we don't lose it. 
- if(allocate_row_blocks && (*(rowBlocks - 1) >> (64 - ROW_BITS)) != static_cast(nRows)) + if(allocate_row_blocks && + (*(rowBlocks - 1) >> (64 - ROW_BITS)) != static_cast(nRows)) { *rowBlocks = (static_cast(nRows) << (64 - ROW_BITS)); if((nRows - last_i) > static_cast(ROWS_FOR_VECTOR)) @@ -350,7 +354,8 @@ extern "C" rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, // Temporary arrays to hold device data std::vector hptr(m + 1); - RETURN_IF_HIP_ERROR(hipMemcpy(hptr.data(), csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR(hipMemcpy( + hptr.data(), csr_row_ptr, sizeof(rocsparse_int) * (m + 1), hipMemcpyDeviceToHost)); // Determine row blocks array size ComputeRowBlocks((unsigned long long*)NULL, info->csrmv_info->size, hptr.data(), m, false); @@ -358,27 +363,27 @@ extern "C" rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, // Create row blocks structure std::vector row_blocks(info->csrmv_info->size, 0); - ComputeRowBlocks(row_blocks.data(), - info->csrmv_info->size, - hptr.data(), - m, - true); + ComputeRowBlocks(row_blocks.data(), info->csrmv_info->size, hptr.data(), m, true); // Allocate memory on device to hold csrmv info, if required if(info->csrmv_info->size > 0) { - RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->csrmv_info->row_blocks, sizeof(unsigned long long) * info->csrmv_info->size)); + RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->csrmv_info->row_blocks, + sizeof(unsigned long long) * info->csrmv_info->size)); // Copy row blocks information to device - RETURN_IF_HIP_ERROR(hipMemcpy(info->csrmv_info->row_blocks, row_blocks.data(), sizeof(unsigned long long) * info->csrmv_info->size, hipMemcpyHostToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpy(info->csrmv_info->row_blocks, + row_blocks.data(), + sizeof(unsigned long long) * info->csrmv_info->size, + hipMemcpyHostToDevice)); } // Store some pointers to verify correct execution - info->csrmv_info->trans = trans; - info->csrmv_info->m = 
m; - info->csrmv_info->n = n; - info->csrmv_info->nnz = nnz; - info->csrmv_info->descr = descr; + info->csrmv_info->trans = trans; + info->csrmv_info->m = m; + info->csrmv_info->n = n; + info->csrmv_info->nnz = nnz; + info->csrmv_info->descr = descr; info->csrmv_info->csr_row_ptr = csr_row_ptr; info->csrmv_info->csr_col_ind = csr_col_ind; @@ -402,9 +407,7 @@ extern "C" rocsparse_status rocsparse_csrmv_analysis_clear(rocsparse_handle hand } // Logging - log_trace(handle, - "rocsparse_csrmv_analysis_clear", - (const void*&)info); + log_trace(handle, "rocsparse_csrmv_analysis_clear", (const void*&)info); // Destroy csrmv info struct RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrmv_info(info->csrmv_info)); @@ -428,8 +431,20 @@ extern "C" rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, float* y, const rocsparse_mat_info info) { - return rocsparse_csrmv_template( - handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); + return rocsparse_csrmv_template(handle, + trans, + m, + n, + nnz, + alpha, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + x, + beta, + y, + info); } extern "C" rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, @@ -447,6 +462,18 @@ extern "C" rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, double* y, const rocsparse_mat_info info) { - return rocsparse_csrmv_template( - handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info); + return rocsparse_csrmv_template(handle, + trans, + m, + n, + nnz, + alpha, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + x, + beta, + y, + info); } diff --git a/library/src/level2/rocsparse_csrmv.hpp b/library/src/level2/rocsparse_csrmv.hpp index 9ea5b824..4d8cf79c 100644 --- a/library/src/level2/rocsparse_csrmv.hpp +++ b/library/src/level2/rocsparse_csrmv.hpp @@ -51,34 +51,46 @@ __global__ void csrmvn_general_kernel_device_pointer(rocsparse_int m, } template -__launch_bounds__(WG_SIZE) -__global__ void 
csrmvn_adaptive_kernel_host_pointer(unsigned long long* __restrict__ row_blocks, - T alpha, - const rocsparse_int* __restrict__ csr_row_ptr, - const rocsparse_int* __restrict__ csr_col_ind, - const T* __restrict__ csr_val, - const T* __restrict__ x, - T beta, - T* __restrict__ y, - rocsparse_index_base idx_base) +__launch_bounds__(WG_SIZE) __global__ + void csrmvn_adaptive_kernel_host_pointer(unsigned long long* __restrict__ row_blocks, + T alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + T beta, + T* __restrict__ y, + rocsparse_index_base idx_base) { - csrmvn_adaptive_device( + csrmvn_adaptive_device( row_blocks, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y, idx_base); } template -__launch_bounds__(WG_SIZE) -__global__ void csrmvn_adaptive_kernel_device_pointer(unsigned long long* __restrict__ row_blocks, - const T* alpha, - const rocsparse_int* __restrict__ csr_row_ptr, - const rocsparse_int* __restrict__ csr_col_ind, - const T* __restrict__ csr_val, - const T* __restrict__ x, - const T* beta, - T* __restrict__ y, - rocsparse_index_base idx_base) +__launch_bounds__(WG_SIZE) __global__ + void csrmvn_adaptive_kernel_device_pointer(unsigned long long* __restrict__ row_blocks, + const T* alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + const T* beta, + T* __restrict__ y, + rocsparse_index_base idx_base) { - csrmvn_adaptive_device( + csrmvn_adaptive_device( row_blocks, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y, idx_base); } @@ -218,17 +230,32 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, if(info == nullptr) { // If csrmv info is not available, call csrmv general - return rocsparse_csrmv_general_template(handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); + 
return rocsparse_csrmv_general_template( + handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } else if(info->csrmv_built == false) { // If csrmv info is not available, call csrmv general - return rocsparse_csrmv_general_template(handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); + return rocsparse_csrmv_general_template( + handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y); } else { // If csrmv info is available, call csrmv adaptive - return rocsparse_csrmv_adaptive_template(handle, trans, m, n, nnz, alpha, descr, csr_val, csr_row_ptr, csr_col_ind, x, beta, y, info->csrmv_info); + return rocsparse_csrmv_adaptive_template(handle, + trans, + m, + n, + nnz, + alpha, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + x, + beta, + y, + info->csrmv_info); } } From 1f53964e94f12e0c39e08fa85413557c1c38ac65 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 11:53:46 +0200 Subject: [PATCH 220/304] version incremented to 0.1.2 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9393e149..66a4a3d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ list(APPEND CMAKE_MODULE_PATH include(cmake/SetToolchain.cmake) # rocSPARSE project -project(rocsparse VERSION 0.1.1.0 LANGUAGES CXX) +project(rocsparse VERSION 0.1.2.0 LANGUAGES CXX) set(rocsparse_SOVERSION 0) # Set a default build type if none was specified From e0f764029354486ee7b75e8b3fe114360fa81441 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 16 Aug 2018 22:18:37 +0200 Subject: [PATCH 221/304] disabled hcc-ctu while it seems to be outdated --- Jenkinsfile | 80 +++++++++++++++---------------- library/src/level2/csrmv_device.h | 10 ++-- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index a5dbe10c..36fa96eb 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -429,48 +429,48 @@ 
def build_pipeline( compiler_data compiler_args, docker_data docker_args, projec } // The following launches 3 builds in parallel: hcc-ctu, hcc-1.6 and cuda -parallel hcc_ctu: -{ - try - { - node( 'docker && rocm && gfx900') - { - def docker_args = new docker_data( - from_image:'compute-artifactory:5001/rocm-developer-tools/hip/master/hip-hcc-ctu-ubuntu-16.04:latest', - build_docker_file:'dockerfile-build-ubuntu', - install_docker_file:'dockerfile-install-ubuntu', - docker_run_args:'--device=/dev/kfd --device=/dev/dri --group-add=video', - docker_build_args:' --pull' ) - - def compiler_args = new compiler_data( - compiler_name:'hcc-ctu', - build_config:'Release', - compiler_path:'/opt/rocm/bin/hcc' ) - - def rocsparse_paths = new project_paths( - project_name:'rocsparse-hcc-ctu', - src_prefix:'src', - build_prefix:'src', - build_command: './install.sh -cd' ) - - def print_version_closure = { - sh """ - set -x - /opt/rocm/bin/hcc --version - """ - } - - build_pipeline( compiler_args, docker_args, rocsparse_paths, print_version_closure ) - } - } - catch( err ) - { - currentBuild.result = 'UNSTABLE' - } -}, +//parallel hcc_ctu: +//{ +// try +// { +// node( 'docker && rocm && gfx900') +// { +// def docker_args = new docker_data( +// from_image:'compute-artifactory:5001/rocm-developer-tools/hip/master/hip-hcc-ctu-ubuntu-16.04:latest', +// build_docker_file:'dockerfile-build-ubuntu', +// install_docker_file:'dockerfile-install-ubuntu', +// docker_run_args:'--device=/dev/kfd --device=/dev/dri --group-add=video', +// docker_build_args:' --pull' ) +// +// def compiler_args = new compiler_data( +// compiler_name:'hcc-ctu', +// build_config:'Release', +// compiler_path:'/opt/rocm/bin/hcc' ) +// +// def rocsparse_paths = new project_paths( +// project_name:'rocsparse-hcc-ctu', +// src_prefix:'src', +// build_prefix:'src', +// build_command: './install.sh -cd' ) +// +// def print_version_closure = { +// sh """ +// set -x +// /opt/rocm/bin/hcc --version +// """ +// } +// +// 
build_pipeline( compiler_args, docker_args, rocsparse_paths, print_version_closure ) +// } +// } +// catch( err ) +// { +// currentBuild.result = 'UNSTABLE' +// } +//}, rocm_ubuntu: { - node( 'docker && rocm && gfx900') + node( 'docker && rocm && dkms') { def hcc_docker_args = new docker_data( from_image:'rocm/dev-ubuntu-16.04:1.7.1', diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index 81df7a95..9763bc64 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -210,20 +210,20 @@ __device__ static __inline__ void atomic_add(double* address, double val) } // rocsparse_int == int32_t -__device__ static __inline__ int32_t mul24(int32_t x, int32_t y) +__device__ static __inline__ int32_t rocsparse_mul24(int32_t x, int32_t y) { return ((x << 8) >> 8) * ((y << 8) >> 8); } // rocsparse_int == int64_t -__device__ static __inline__ int64_t mul24(int64_t x, int64_t y) +__device__ static __inline__ int64_t rocsparse_mul24(int64_t x, int64_t y) { return ((x << 40) >> 40) * ((y << 40) >> 40); } -__device__ static __inline__ rocsparse_int mad24(rocsparse_int x, rocsparse_int y, rocsparse_int z) +__device__ static __inline__ rocsparse_int rocsparse_mad24(rocsparse_int x, rocsparse_int y, rocsparse_int z) { - return mul24(x, y) + z; + return rocsparse_mul24(x, y) + z; } template @@ -291,7 +291,7 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. // If there are more items in this row, we assign more workgroups. - rocsparse_int vecStart = mad24(wg, BLOCK_MULTIPLIER * BLOCKSIZE, csr_row_ptr[row] - idx_base); + rocsparse_int vecStart = rocsparse_mad24(wg, BLOCK_MULTIPLIER * BLOCKSIZE, csr_row_ptr[row] - idx_base); rocsparse_int vecEnd = ((csr_row_ptr[row + 1] - idx_base) > vecStart + BLOCK_MULTIPLIER * BLOCKSIZE) ? 
vecStart + BLOCK_MULTIPLIER * BLOCKSIZE From 28f3984500d57cde464c0dc308f7b410c9383b56 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 17 Aug 2018 07:49:59 +0200 Subject: [PATCH 222/304] changed XXX_EPSILON to std::limits --- clients/common/unit.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index 5268d138..496bffd8 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -151,7 +151,8 @@ void unit_check_near(rocsparse_int M, rocsparse_int N, float* hCPU, float* hGPU) { for(rocsparse_int i = 0; i < M; i++) { - float compare_val = std::max(std::abs(hCPU[i + j] * 1e-6f), 10 * FLT_EPSILON); + float compare_val = + std::max(std::abs(hCPU[i + j] * 1e-6f), 10 * std::numeric_limits::epsilon()); #ifdef GOOGLE_TEST ASSERT_NEAR(hCPU[i + j], hGPU[i + j], compare_val); #else @@ -168,7 +169,8 @@ void unit_check_near(rocsparse_int M, rocsparse_int N, double* hCPU, double* hGP { for(rocsparse_int i = 0; i < M; i++) { - double compare_val = std::max(std::abs(hCPU[i + j] * 1e-14), 10 * DBL_EPSILON); + double compare_val = std::max(std::abs(hCPU[i + j] * 1e-14), + 10 * std::numeric_limits::epsilon()); #ifdef GOOGLE_TEST ASSERT_NEAR(hCPU[i + j], hGPU[i + j], compare_val); #else From 180eaa30775f8a6ef7864c26881794cf90dad89d Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 23 Aug 2018 16:21:11 +0200 Subject: [PATCH 223/304] reorganized csrmv adaptive --- .../rocsparse_template_specialization.cpp | 16 +++--- clients/include/rocsparse.hpp | 4 +- clients/include/testing_csrmv.hpp | 50 +++++++++---------- clients/samples/example_csrmv.cpp | 8 +-- library/include/rocsparse-functions.h | 22 ++++---- library/src/level2/rocsparse_csrmv.cpp | 16 +++--- library/src/level2/rocsparse_csrmv.hpp | 12 ++--- 7 files changed, 64 insertions(+), 64 deletions(-) diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index ef9e2d39..57d72125 
100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ b/clients/common/rocsparse_template_specialization.cpp @@ -197,10 +197,10 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, const float* csr_val, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, const float* x, const float* beta, - float* y, - const rocsparse_mat_info info) + float* y) { return rocsparse_scsrmv(handle, trans, @@ -212,10 +212,10 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, csr_val, csr_row_ptr, csr_col_ind, + info, x, beta, - y, - info); + y); } template <> @@ -229,10 +229,10 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, const double* csr_val, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, const double* x, const double* beta, - double* y, - const rocsparse_mat_info info) + double* y) { return rocsparse_dcsrmv(handle, trans, @@ -244,10 +244,10 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, csr_val, csr_row_ptr, csr_col_ind, + info, x, beta, - y, - info); + y); } template <> diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index fb025f0c..f18c253b 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -87,10 +87,10 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, const T* csr_val, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, const T* x, const T* beta, - T* y, - const rocsparse_mat_info info); + T* y); template rocsparse_status rocsparse_ellmv(rocsparse_handle handle, diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index 49e7a64d..d931179b 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -113,10 +113,10 @@ void testing_csrmv_bad_arg(void) dval, dptr_null, dcol, + nullptr, dx, &beta, - dy, - nullptr); + dy); 
verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); } // testing for(nullptr == dcol) @@ -133,10 +133,10 @@ void testing_csrmv_bad_arg(void) dval, dptr, dcol_null, + nullptr, dx, &beta, - dy, - nullptr); + dy); verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); } // testing for(nullptr == dval) @@ -153,10 +153,10 @@ void testing_csrmv_bad_arg(void) dval_null, dptr, dcol, + nullptr, dx, &beta, - dy, - nullptr); + dy); verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); } // testing for(nullptr == dx) @@ -173,10 +173,10 @@ void testing_csrmv_bad_arg(void) dval, dptr, dcol, + nullptr, dx_null, &beta, - dy, - nullptr); + dy); verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); } // testing for(nullptr == dy) @@ -193,10 +193,10 @@ void testing_csrmv_bad_arg(void) dval, dptr, dcol, + nullptr, dx, &beta, - dy_null, - nullptr); + dy_null); verify_rocsparse_status_invalid_pointer(status, "Error: dy is nullptr"); } // testing for(nullptr == d_alpha) @@ -213,10 +213,10 @@ void testing_csrmv_bad_arg(void) dval, dptr, dcol, + nullptr, dx, &beta, - dy, - nullptr); + dy); verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); } // testing for(nullptr == d_beta) @@ -233,10 +233,10 @@ void testing_csrmv_bad_arg(void) dval, dptr, dcol, + nullptr, dx, d_beta_null, - dy, - nullptr); + dy); verify_rocsparse_status_invalid_pointer(status, "Error: beta is nullptr"); } // testing for(nullptr == descr) @@ -253,10 +253,10 @@ void testing_csrmv_bad_arg(void) dval, dptr, dcol, + nullptr, dx, &beta, - dy, - nullptr); + dy); verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); } // testing for(nullptr == handle) @@ -273,10 +273,10 @@ void testing_csrmv_bad_arg(void) dval, dptr, dcol, + nullptr, dx, &beta, - dy, - nullptr); + dy); verify_rocsparse_status_invalid_handle(status); } @@ -401,7 +401,7 @@ rocsparse_status testing_csrmv(Arguments argus) 
CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); status = rocsparse_csrmv( - handle, transA, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy, info); + handle, transA, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, info, dx, &h_beta, dy); if(m < 0 || n < 0 || nnz < 0) { @@ -540,12 +540,12 @@ rocsparse_status testing_csrmv(Arguments argus) // ROCSPARSE pointer mode host CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); CHECK_ROCSPARSE_ERROR(rocsparse_csrmv( - handle, transA, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, dx, &h_beta, dy_1, info)); + handle, transA, m, n, nnz, &h_alpha, descr, dval, dptr, dcol, info, dx, &h_beta, dy_1)); // ROCSPARSE pointer mode device CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); CHECK_ROCSPARSE_ERROR(rocsparse_csrmv( - handle, transA, m, n, nnz, d_alpha, descr, dval, dptr, dcol, dx, d_beta, dy_2, info)); + handle, transA, m, n, nnz, d_alpha, descr, dval, dptr, dcol, info, dx, d_beta, dy_2)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); @@ -684,10 +684,10 @@ rocsparse_status testing_csrmv(Arguments argus) dval, dptr, dcol, + info, dx, &h_beta, - dy_1, - info); + dy_1); } double gpu_time_used = get_time_us(); // in microseconds @@ -704,10 +704,10 @@ rocsparse_status testing_csrmv(Arguments argus) dval, dptr, dcol, + info, dx, &h_beta, - dy_1, - info); + dy_1); } // Convert to miliseconds per call diff --git a/clients/samples/example_csrmv.cpp b/clients/samples/example_csrmv.cpp index d8e9bbdb..c6b2ecfb 100644 --- a/clients/samples/example_csrmv.cpp +++ b/clients/samples/example_csrmv.cpp @@ -95,10 +95,10 @@ int main(int argc, char* argv[]) dAval, dAptr, dAcol, + nullptr, dx, &hbeta, - dy, - nullptr); + dy); } // Device synchronization @@ -123,10 +123,10 @@ int main(int argc, char* argv[]) dAval, dAptr, dAcol, + nullptr, dx, 
&hbeta, - dy, - nullptr); + dy); } // Device synchronization diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 1c2dbdb2..4111afd5 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -626,6 +626,9 @@ rocsparse_status rocsparse_csrmv_analysis_clear(rocsparse_handle handle, rocspar @param[in] csr_col_ind array of nnz elements containing the column indices of A. @param[in] + info [optional] information collected by rocsparse_csrmv_analysis. + if nullptr is passed, general csrmv routine will be called. + @param[in] x array of n elements (op(A) = A) or m elements (op(A) = A^T or op(A) = A^H). @param[in] @@ -633,9 +636,6 @@ rocsparse_status rocsparse_csrmv_analysis_clear(rocsparse_handle handle, rocspar @param[inout] y array of m elements (op(A) = A) or n elements (op(A) = A^T or op(A) = A^H). - @param[in] - info [optional] information collected by rocsparse_csrmv_analysis. - if nullptr is passed, general csrmv routine will be called. 
********************************************************************/ ROCSPARSE_EXPORT @@ -649,10 +649,10 @@ rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, const float* csr_val, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, const float* x, const float* beta, - float* y, - const rocsparse_mat_info info); + float* y); ROCSPARSE_EXPORT rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, @@ -665,10 +665,10 @@ rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, const double* csr_val, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, const double* x, const double* beta, - double* y, - const rocsparse_mat_info info); + double* y); /* ROCSPARSE_EXPORT rocsparse_status rocsparse_ccsrmv(rocsparse_handle handle, @@ -681,10 +681,10 @@ rocsparse_status rocsparse_ccsrmv(rocsparse_handle handle, const rocsparse_float_complex* csr_val, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, const rocsparse_float_complex* x, const rocsparse_float_complex* beta, - rocsparse_float_complex* y, - const rocsparse_mat_info info); + rocsparse_float_complex* y); ROCSPARSE_EXPORT rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, @@ -697,10 +697,10 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, const rocsparse_double_complex* csr_val, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, const rocsparse_double_complex* x, const rocsparse_double_complex* beta, - rocsparse_double_complex* y, - const rocsparse_mat_info info); + rocsparse_double_complex* y); */ /*! 
\brief SPARSE Level 2 API diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index b9dac65f..e83311df 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -426,10 +426,10 @@ extern "C" rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, const float* csr_val, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, const float* x, const float* beta, - float* y, - const rocsparse_mat_info info) + float* y) { return rocsparse_csrmv_template(handle, trans, @@ -441,10 +441,10 @@ extern "C" rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, csr_val, csr_row_ptr, csr_col_ind, + info, x, beta, - y, - info); + y); } extern "C" rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, @@ -457,10 +457,10 @@ extern "C" rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, const double* csr_val, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, const double* x, const double* beta, - double* y, - const rocsparse_mat_info info) + double* y) { return rocsparse_csrmv_template(handle, trans, @@ -472,8 +472,8 @@ extern "C" rocsparse_status rocsparse_dcsrmv(rocsparse_handle handle, csr_val, csr_row_ptr, csr_col_ind, + info, x, beta, - y, - info); + y); } diff --git a/library/src/level2/rocsparse_csrmv.hpp b/library/src/level2/rocsparse_csrmv.hpp index 4d8cf79c..5b545439 100644 --- a/library/src/level2/rocsparse_csrmv.hpp +++ b/library/src/level2/rocsparse_csrmv.hpp @@ -105,10 +105,10 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, const T* csr_val, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, const T* x, const T* beta, - T* y, - const rocsparse_mat_info info) + T* y) { // Check for valid handle and matrix descriptor if(handle == nullptr) @@ -252,10 +252,10 @@ rocsparse_status rocsparse_csrmv_template(rocsparse_handle handle, 
csr_val, csr_row_ptr, csr_col_ind, + info->csrmv_info, x, beta, - y, - info->csrmv_info); + y); } } @@ -712,10 +712,10 @@ rocsparse_status rocsparse_csrmv_adaptive_template(rocsparse_handle handle, const T* csr_val, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, + rocsparse_csrmv_info info, const T* x, const T* beta, - T* y, - const rocsparse_csrmv_info info) + T* y) { // Check if info matches current matrix and options if(info->trans != trans) From b3bee1a951ede341b0b0f50b1e9229169d3fb662 Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Thu, 23 Aug 2018 16:46:44 -0400 Subject: [PATCH 224/304] Add option --hip-clang to install.sh --- cmake/SetToolchain.cmake | 2 +- install.sh | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/cmake/SetToolchain.cmake b/cmake/SetToolchain.cmake index 467176da..e71d00c1 100644 --- a/cmake/SetToolchain.cmake +++ b/cmake/SetToolchain.cmake @@ -2,7 +2,7 @@ find_package(HIP REQUIRED) # Select toolchain -if(HIP_PLATFORM STREQUAL "nvcc") +if(HIP_PLATFORM STREQUAL "nvcc" OR HIP_COMPILER STREQUAL "clang") # Find HIPCC executable find_program( HIP_HIPCC_EXECUTABLE diff --git a/install.sh b/install.sh index afc68ebd..60b4491a 100755 --- a/install.sh +++ b/install.sh @@ -40,6 +40,7 @@ function display_help() echo " [-c|--clients] build library clients too (combines with -i & -d)" echo " [-g|--debug] -DCMAKE_BUILD_TYPE=Debug (default is =Release)" echo " [--cuda] build library for cuda backend" + echo " [--hip-clang] build library with hip-clang" } # This function is helpful for dockerfiles that do not have sudo installed, but the default user is root @@ -61,6 +62,7 @@ install_package=false install_dependencies=false build_clients=false build_cuda=false +build_hip_clang=false build_release=true # ################################################# @@ -70,7 +72,7 @@ build_release=true # check if we have a modern version of getopt that can handle whitespace and long parameters getopt -T if [[ 
$? -eq 4 ]]; then - GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,install,clients,dependencies,debug,cuda --options hicgd -- "$@") + GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,install,clients,dependencies,debug,cuda,hip-clang --options hicgd -- "$@") else echo "Need a new version of getopt" exit 1 @@ -104,6 +106,9 @@ while true; do --cuda) build_cuda=true shift ;; + --hip-clang) + build_hip_clang=true + shift ;; --) shift ; break ;; *) echo "Unexpected command line parameter received; aborting"; exit 1 @@ -196,7 +201,14 @@ pushd . # On ROCm platforms, hcc compiler can build everything if [[ "${build_cuda}" == false ]]; then - CXX=hcc cmake ${cmake_common_options} ${cmake_client_options} -DCMAKE_PREFIX_PATH="$(pwd)/../deps/deps-install" ../.. + if [[ "${build_hip_clang}" == true ]]; then + CXX=hipcc + HIP_COMPILER=clang + else + CXX=hcc + HIP_COMPILER=hcc + fi + CXX=$CXX cmake -DHIP_COMPILER=$HIP_COMPILER ${cmake_common_options} ${cmake_client_options} -DCMAKE_PREFIX_PATH="$(pwd)/../deps/deps-install" ../.. 
make -j$(nproc) else # The nvidia compile is a little more complicated, in that we split compiling the library from the clients From 262cc1392706f73e34957e05ae368eacfe123401 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 24 Aug 2018 11:33:06 +0200 Subject: [PATCH 225/304] cleaning a bit --- clients/common/utility.cpp | 2 +- clients/include/testing_csrmv.hpp | 2 +- library/src/conversion/csr2coo_device.h | 16 ++-- library/src/conversion/rocsparse_csr2coo.cpp | 4 +- library/src/handle.cpp | 4 +- library/src/include/handle.h | 4 +- library/src/level2/coomv_device.h | 77 +++++++++---------- library/src/level2/csrmv_device.h | 60 +++++++-------- library/src/level2/rocsparse_coomv.hpp | 78 ++++++++++---------- library/src/level2/rocsparse_csrmv.hpp | 16 ++-- library/src/level3/csrmm_device.h | 76 +++++++++---------- library/src/level3/rocsparse_csrmm.hpp | 28 +++---- 12 files changed, 184 insertions(+), 183 deletions(-) diff --git a/clients/common/utility.cpp b/clients/common/utility.cpp index 9d371bea..4f870f4e 100644 --- a/clients/common/utility.cpp +++ b/clients/common/utility.cpp @@ -46,7 +46,7 @@ rocsparse_int query_device_property() (int)(props.clockRate / 1000), props.major, props.minor); - printf("maxGridDimX %d, sharedMemPerBlock %ldKB, maxThreadsPerBlock %d, warpSize %d\n", + printf("maxGridDimX %d, sharedMemPerBlock %ldKB, maxThreadsPerBlock %d, wavefrontSize %d\n", props.maxGridSize[0], props.sharedMemPerBlock >> 10, props.maxThreadsPerBlock, diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index d931179b..76a39cf3 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -575,7 +575,7 @@ rocsparse_status testing_csrmv(Arguments argus) } else { - // Query for warpSize + // Query for wavefrontSize hipDeviceProp_t prop; hipGetDeviceProperties(&prop, 0); diff --git a/library/src/conversion/csr2coo_device.h b/library/src/conversion/csr2coo_device.h index 410d755e..5b7cdbf6 100644 --- 
a/library/src/conversion/csr2coo_device.h +++ b/library/src/conversion/csr2coo_device.h @@ -9,22 +9,22 @@ #include // CSR to COO matrix conversion kernel -template +template __global__ void csr2coo_kernel(rocsparse_int m, const rocsparse_int* csr_row_ptr, rocsparse_int* coo_row_ind, rocsparse_index_base idx_base) { - rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - rocsparse_int lid = hipThreadIdx_x % THREADS; - rocsparse_int vid = gid / THREADS; - rocsparse_int nvec = hipGridDim_x * hipBlockDim_x / THREADS; + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; + rocsparse_int lid = tid & (WF_SIZE - 1); + rocsparse_int nwf = hipGridDim_x * hipBlockDim_x / WF_SIZE; - for(rocsparse_int ai = vid; ai < m; ai += nvec) + for(rocsparse_int row = gid / WF_SIZE; row < m; row += nwf) { - for(rocsparse_int aj = csr_row_ptr[ai] + lid; aj < csr_row_ptr[ai + 1]; aj += THREADS) + for(rocsparse_int aj = csr_row_ptr[row] + lid; aj < csr_row_ptr[row + 1]; aj += WF_SIZE) { - coo_row_ind[aj - idx_base] = ai + idx_base; + coo_row_ind[aj - idx_base] = row + idx_base; } } } diff --git a/library/src/conversion/rocsparse_csr2coo.cpp b/library/src/conversion/rocsparse_csr2coo.cpp index a7de6980..7baaafe8 100644 --- a/library/src/conversion/rocsparse_csr2coo.cpp +++ b/library/src/conversion/rocsparse_csr2coo.cpp @@ -68,7 +68,7 @@ extern "C" rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, dim3 csr2coo_blocks((m - 1) / CSR2COO_DIM + 1); dim3 csr2coo_threads(CSR2COO_DIM); - if(handle->warp_size == 32) + if(handle->wavefront_size == 32) { if(nnz_per_row < 4) { @@ -131,7 +131,7 @@ extern "C" rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, idx_base); } } - else if(handle->warp_size == 64) + else if(handle->wavefront_size == 64) { if(nnz_per_row < 4) { diff --git a/library/src/handle.cpp b/library/src/handle.cpp index dd2ff277..a6081f7d 100644 --- a/library/src/handle.cpp +++ b/library/src/handle.cpp @@ -17,8 
+17,8 @@ _rocsparse_handle::_rocsparse_handle() THROW_IF_HIP_ERROR(hipGetDevice(&device)); THROW_IF_HIP_ERROR(hipGetDeviceProperties(&properties, device)); - // Device warp size - warp_size = properties.warpSize; + // Device wavefront size + wavefront_size = properties.warpSize; // Layer mode char* str_layer_mode; diff --git a/library/src/include/handle.h b/library/src/include/handle.h index d43afcf1..71f32758 100644 --- a/library/src/include/handle.h +++ b/library/src/include/handle.h @@ -38,8 +38,8 @@ struct _rocsparse_handle int device; // device properties hipDeviceProp_t properties; - // device warp size - int warp_size; + // device wavefront size + int wavefront_size; // stream ; default stream is system stream NULL hipStream_t stream = 0; // pointer mode ; default mode is host diff --git a/library/src/level2/coomv_device.h b/library/src/level2/coomv_device.h index 069ecc66..4d1003ff 100644 --- a/library/src/level2/coomv_device.h +++ b/library/src/level2/coomv_device.h @@ -25,36 +25,36 @@ __global__ void coomv_scale(rocsparse_int size, T scalar, T* __restrict__ data) // Implementation motivated by papers 'Efficient Sparse Matrix-Vector Multiplication on CUDA', // 'Implementing Sparse Matrix-Vector Multiplication on Throughput-Oriented Processors' and // 'Segmented operations for sparse matrix computation on vector multiprocessors' -template -static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, - rocsparse_int loops, - T alpha, - const rocsparse_int* coo_row_ind, - const rocsparse_int* coo_col_ind, - const T* coo_val, - const T* x, - T* y, - rocsparse_int* row_block_red, - T* val_block_red, - rocsparse_index_base idx_base) +template +static __device__ void coomvn_general_wf_reduce(rocsparse_int nnz, + rocsparse_int loops, + T alpha, + const rocsparse_int* coo_row_ind, + const rocsparse_int* coo_col_ind, + const T* coo_val, + const T* x, + T* y, + rocsparse_int* row_block_red, + T* val_block_red, + rocsparse_index_base idx_base) { rocsparse_int 
gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; rocsparse_int tid = hipThreadIdx_x; - // Lane index (0,...,WARPSIZE) - rocsparse_int laneid = gid % WARPSIZE; - // Warp index - rocsparse_int warpid = gid / WARPSIZE; + // Lane index (0,...,WF_SIZE) + rocsparse_int lid = gid % WF_SIZE; + // Wavefront index + rocsparse_int wid = gid / WF_SIZE; // Initialize block buffers - if(laneid == 0) + if(lid == 0) { - row_block_red[warpid] = -1; - val_block_red[warpid] = static_cast(0); + row_block_red[wid] = -1; + val_block_red[wid] = static_cast(0); } - // Global COO array index start for current warp - rocsparse_int offset = warpid * loops * WARPSIZE; + // Global COO array index start for current wavefront + rocsparse_int offset = wid * loops * WF_SIZE; // Shared memory to hold row indices and values for segmented reduction __shared__ rocsparse_int shared_row[BLOCKSIZE]; @@ -67,7 +67,7 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, __syncthreads(); // Quick return when thread is out of bounds - if(offset + laneid >= nnz) + if(offset + lid >= nnz) { return; } @@ -76,14 +76,14 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, T val; // Current threads index into COO structure - rocsparse_int idx = offset + laneid; + rocsparse_int idx = offset + lid; // Each thread processes 'loop' COO entries - while(idx < offset + loops * WARPSIZE) + while(idx < offset + loops * WF_SIZE) { // Get corresponding COO entry, if not out of bounds. // This can happen when processing more than 1 entry if - // nnz % WARPSIZE != 0 + // nnz % WF_SIZE != 0 if(idx < nnz) { row = coo_row_ind[idx] - idx_base; @@ -95,19 +95,19 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, val = static_cast(0); } - // First thread in warp checks row index from previous loop + // First thread in wavefront checks row index from previous loop // if it has been completed or if additional rows have to be // appended. 
- if(idx > offset && laneid == 0) + if(idx > offset && lid == 0) { - rocsparse_int prevrow = shared_row[tid + WARPSIZE - 1]; + rocsparse_int prevrow = shared_row[tid + WF_SIZE - 1]; if(row == prevrow) { - val += shared_val[tid + WARPSIZE - 1]; + val += shared_val[tid + WF_SIZE - 1]; } else if(prevrow >= 0) { - y[prevrow] += shared_val[tid + WARPSIZE - 1]; + y[prevrow] += shared_val[tid + WF_SIZE - 1]; } } @@ -120,10 +120,10 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, __syncthreads(); #pragma unroll - // Segmented warp reduction - for(rocsparse_int j = 1; j < WARPSIZE; j <<= 1) + // Segmented wavefront reduction + for(rocsparse_int j = 1; j < WF_SIZE; j <<= 1) { - if(laneid >= j) + if(lid >= j) { if(row == shared_row[tid - j]) { @@ -139,7 +139,7 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, // All lanes but the last one write their result in y. // The last value might need to be appended by the next iteration. - if(laneid < WARPSIZE - 1) + if(lid < WF_SIZE - 1) { if(row != shared_row[tid + 1] && row >= 0) { @@ -148,14 +148,14 @@ static __device__ void coomvn_general_warp_reduce(rocsparse_int nnz, } // Keep going for the next iteration - idx += WARPSIZE; + idx += WF_SIZE; } // Write last entries into buffers for segmented block reduction - if(laneid == WARPSIZE - 1) + if(lid == WF_SIZE - 1) { - row_block_red[warpid] = row; - val_block_red[warpid] = val; + row_block_red[wid] = row; + val_block_red[wid] = val; } } @@ -164,6 +164,7 @@ template static __device__ void segmented_blockreduce(const rocsparse_int* rows, T* vals) { rocsparse_int tid = hipThreadIdx_x; + #pragma unroll for(rocsparse_int j = 1; j < BLOCKSIZE; j <<= 1) { diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index 9763bc64..03838101 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -11,7 +11,7 @@ __device__ int __llvm_amdgcn_readlane(int index, int offset) __asm("llvm.amdgcn. 
#if defined(__HIP_PLATFORM_HCC__) // Swizzle-based float wavefront reduction -template +template __device__ float wf_reduce(float sum) { typedef union flt_b32 @@ -24,37 +24,37 @@ __device__ float wf_reduce(float sum) flt_b32_t temp_sum; temp_sum.val = sum; - if(SUBWAVE_SIZE > 1) + if(WF_SIZE > 1) { upper_sum.b32 = __hip_ds_swizzle(temp_sum.b32, 0x80b1); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 2) + if(WF_SIZE > 2) { upper_sum.b32 = __hip_ds_swizzle(temp_sum.b32, 0x804e); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 4) + if(WF_SIZE > 4) { upper_sum.b32 = __hip_ds_swizzle(temp_sum.b32, 0x101f); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 8) + if(WF_SIZE > 8) { upper_sum.b32 = __hip_ds_swizzle(temp_sum.b32, 0x201f); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 16) + if(WF_SIZE > 16) { upper_sum.b32 = __hip_ds_swizzle(temp_sum.b32, 0x401f); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 32) + if(WF_SIZE > 32) { upper_sum.b32 = __llvm_amdgcn_readlane(temp_sum.b32, 32); temp_sum.val += upper_sum.val; @@ -65,7 +65,7 @@ __device__ float wf_reduce(float sum) } // Swizzle-based double wavefront reduction -template +template __device__ double wf_reduce(double sum) { typedef union dbl_b32 @@ -78,42 +78,42 @@ __device__ double wf_reduce(double sum) dbl_b32_t temp_sum; temp_sum.val = sum; - if(SUBWAVE_SIZE > 1) + if(WF_SIZE > 1) { upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x80b1); upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x80b1); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 2) + if(WF_SIZE > 2) { upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x804e); upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x804e); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 4) + if(WF_SIZE > 4) { upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x101f); upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x101f); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 8) + if(WF_SIZE > 8) { upper_sum.b32[0] 
= __hip_ds_swizzle(temp_sum.b32[0], 0x201f); upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x201f); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 16) + if(WF_SIZE > 16) { upper_sum.b32[0] = __hip_ds_swizzle(temp_sum.b32[0], 0x401f); upper_sum.b32[1] = __hip_ds_swizzle(temp_sum.b32[1], 0x401f); temp_sum.val += upper_sum.val; } - if(SUBWAVE_SIZE > 32) + if(WF_SIZE > 32) { upper_sum.b32[0] = __llvm_amdgcn_readlane(temp_sum.b32[0], 32); upper_sum.b32[1] = __llvm_amdgcn_readlane(temp_sum.b32[1], 32); @@ -124,10 +124,10 @@ __device__ double wf_reduce(double sum) return sum; } #elif defined(__HIP_PLATFORM_NVCC__) -template +template __device__ T wf_reduce(T sum) { - for(rocsparse_int i = SUBWAVE_SIZE >> 1; i > 0; i >>= 1) + for(rocsparse_int i = WF_SIZE >> 1; i > 0; i >>= 1) { sum += __shfl_down_sync(0xffffffff, sum, i); } @@ -136,7 +136,7 @@ __device__ T wf_reduce(T sum) } #endif -template +template static __device__ void csrmvn_general_device(rocsparse_int m, T alpha, const rocsparse_int* row_offset, @@ -147,33 +147,33 @@ static __device__ void csrmvn_general_device(rocsparse_int m, T* y, rocsparse_index_base idx_base) { - rocsparse_int tid = hipThreadIdx_x; - rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; - rocsparse_int lid = tid & (SUBWAVE_SIZE - 1); - rocsparse_int nwarps = hipGridDim_x * hipBlockDim_x / SUBWAVE_SIZE; + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; + rocsparse_int lid = tid & (WF_SIZE - 1); + rocsparse_int nwf = hipGridDim_x * hipBlockDim_x / WF_SIZE; - // Loop over rows each subwave processes - for(rocsparse_int row = gid / SUBWAVE_SIZE; row < m; row += nwarps) + // Loop over rows + for(rocsparse_int row = gid / WF_SIZE; row < m; row += nwf) { - // Each subwave processes one row + // Each wavefront processes one row rocsparse_int row_start = row_offset[row] - idx_base; rocsparse_int row_end = row_offset[row + 1] - idx_base; - T sum = 0.0; + T sum = static_cast(0); - // Loop 
over non-zero elements of subwave row - for(rocsparse_int j = row_start + lid; j < row_end; j += SUBWAVE_SIZE) + // Loop over non-zero elements + for(rocsparse_int j = row_start + lid; j < row_end; j += WF_SIZE) { sum = fma(alpha * csr_val[j], __ldg(x + csr_col_ind[j] - idx_base), sum); } // Obtain row sum using parallel reduction - sum = wf_reduce(sum); + sum = wf_reduce(sum); - // First thread of each subwave writes result into global memory + // First thread of each wavefront writes result into global memory if(lid == 0) { - if(beta == 0) + if(beta == static_cast(0)) { y[row] = sum; } diff --git a/library/src/level2/rocsparse_coomv.hpp b/library/src/level2/rocsparse_coomv.hpp index fa2b7aaa..078ffd73 100644 --- a/library/src/level2/rocsparse_coomv.hpp +++ b/library/src/level2/rocsparse_coomv.hpp @@ -14,9 +14,9 @@ #include -template +template __launch_bounds__(128) __global__ - void coomvn_warp_host_pointer(rocsparse_int nnz, + void coomvn_wf_host_pointer(rocsparse_int nnz, rocsparse_int loops, T alpha, const rocsparse_int* __restrict__ coo_row_ind, @@ -28,22 +28,22 @@ __launch_bounds__(128) __global__ T* __restrict__ val_block_red, rocsparse_index_base idx_base) { - coomvn_general_warp_reduce(nnz, - loops, - alpha, - coo_row_ind, - coo_col_ind, - coo_val, - x, - y, - row_block_red, - val_block_red, - idx_base); + coomvn_general_wf_reduce(nnz, + loops, + alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + idx_base); } -template +template __launch_bounds__(128) __global__ - void coomvn_warp_device_pointer(rocsparse_int nnz, + void coomvn_wf_device_pointer(rocsparse_int nnz, rocsparse_int loops, const T* alpha, const rocsparse_int* __restrict__ coo_row_ind, @@ -55,17 +55,17 @@ __launch_bounds__(128) __global__ T* __restrict__ val_block_red, rocsparse_index_base idx_base) { - coomvn_general_warp_reduce(nnz, - loops, - *alpha, - coo_row_ind, - coo_col_ind, - coo_val, - x, - y, - row_block_red, - val_block_red, - idx_base); + 
coomvn_general_wf_reduce(nnz, + loops, + *alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + idx_base); } template @@ -213,8 +213,8 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, rocsparse_int minblocks = (nnz - 1) / COOMVN_DIM + 1; rocsparse_int nblocks = maxblocks < minblocks ? maxblocks : minblocks; - rocsparse_int nwarps = nblocks * (COOMVN_DIM / handle->warp_size); - rocsparse_int nloops = (nnz / handle->warp_size + 1) / nwarps + 1; + rocsparse_int nwfs = nblocks * (COOMVN_DIM / handle->wavefront_size); + rocsparse_int nloops = (nnz / handle->wavefront_size + 1) / nwfs + 1; dim3 coomvn_blocks(nblocks); dim3 coomvn_threads(COOMVN_DIM); @@ -223,8 +223,8 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, T* val_block_red = NULL; // Allocating a maximum of 8 kByte - RETURN_IF_HIP_ERROR(hipMalloc((void**)&row_block_red, sizeof(rocsparse_int) * nwarps)); - RETURN_IF_HIP_ERROR(hipMalloc((void**)&val_block_red, sizeof(T) * nwarps)); + RETURN_IF_HIP_ERROR(hipMalloc((void**)&row_block_red, sizeof(rocsparse_int) * nwfs)); + RETURN_IF_HIP_ERROR(hipMalloc((void**)&val_block_red, sizeof(T) * nwfs)); if(handle->pointer_mode == rocsparse_pointer_mode_device) { @@ -248,9 +248,9 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, y); } - if(handle->warp_size == 32) + if(handle->wavefront_size == 32) { - hipLaunchKernelGGL((coomvn_warp_device_pointer), + hipLaunchKernelGGL((coomvn_wf_device_pointer), coomvn_blocks, coomvn_threads, 0, @@ -267,9 +267,9 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, val_block_red, descr->base); } - else if(handle->warp_size == 64) + else if(handle->wavefront_size == 64) { - hipLaunchKernelGGL((coomvn_warp_device_pointer), + hipLaunchKernelGGL((coomvn_wf_device_pointer), coomvn_blocks, coomvn_threads, 0, @@ -315,9 +315,9 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, y); } - if(handle->warp_size == 32) 
+ if(handle->wavefront_size == 32) { - hipLaunchKernelGGL((coomvn_warp_host_pointer), + hipLaunchKernelGGL((coomvn_wf_host_pointer), coomvn_blocks, coomvn_threads, 0, @@ -334,9 +334,9 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, val_block_red, descr->base); } - else if(handle->warp_size == 64) + else if(handle->wavefront_size == 64) { - hipLaunchKernelGGL((coomvn_warp_host_pointer), + hipLaunchKernelGGL((coomvn_wf_host_pointer), coomvn_blocks, coomvn_threads, 0, @@ -364,7 +364,7 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, coomvn_threads, 0, stream, - nwarps, + nwfs, row_block_red, val_block_red, y); diff --git a/library/src/level2/rocsparse_csrmv.hpp b/library/src/level2/rocsparse_csrmv.hpp index 5b545439..bacd214c 100644 --- a/library/src/level2/rocsparse_csrmv.hpp +++ b/library/src/level2/rocsparse_csrmv.hpp @@ -20,7 +20,7 @@ #define ROW_BITS 32 #define WG_SIZE 256 -template +template __global__ void csrmvn_general_kernel_host_pointer(rocsparse_int m, T alpha, const rocsparse_int* __restrict__ csr_row_ptr, @@ -31,11 +31,11 @@ __global__ void csrmvn_general_kernel_host_pointer(rocsparse_int m, T* __restrict__ y, rocsparse_index_base idx_base) { - csrmvn_general_device( + csrmvn_general_device( m, alpha, csr_row_ptr, csr_col_ind, csr_val, x, beta, y, idx_base); } -template +template __global__ void csrmvn_general_kernel_device_pointer(rocsparse_int m, const T* alpha, const rocsparse_int* __restrict__ csr_row_ptr, @@ -46,7 +46,7 @@ __global__ void csrmvn_general_kernel_device_pointer(rocsparse_int m, T* __restrict__ y, rocsparse_index_base idx_base) { - csrmvn_general_device( + csrmvn_general_device( m, *alpha, csr_row_ptr, csr_col_ind, csr_val, x, *beta, y, idx_base); } @@ -288,7 +288,7 @@ rocsparse_status rocsparse_csrmv_general_template(rocsparse_handle handle, if(handle->pointer_mode == rocsparse_pointer_mode_device) { - if(handle->warp_size == 32) + if(handle->wavefront_size == 32) { if(nnz_per_row < 4) { @@ 
-376,7 +376,7 @@ rocsparse_status rocsparse_csrmv_general_template(rocsparse_handle handle, descr->base); } } - else if(handle->warp_size == 64) + else if(handle->wavefront_size == 64) { if(nnz_per_row < 4) { @@ -493,7 +493,7 @@ rocsparse_status rocsparse_csrmv_general_template(rocsparse_handle handle, return rocsparse_status_success; } - if(handle->warp_size == 32) + if(handle->wavefront_size == 32) { if(nnz_per_row < 4) { @@ -581,7 +581,7 @@ rocsparse_status rocsparse_csrmv_general_template(rocsparse_handle handle, descr->base); } } - else if(handle->warp_size == 64) + else if(handle->wavefront_size == 64) { if(nnz_per_row < 4) { diff --git a/library/src/level3/csrmm_device.h b/library/src/level3/csrmm_device.h index f8ab2911..1c18feda 100644 --- a/library/src/level3/csrmm_device.h +++ b/library/src/level3/csrmm_device.h @@ -4,7 +4,7 @@ #include -template +template static __device__ void csrmmnn_general_device(rocsparse_int M, rocsparse_int N, rocsparse_int K, @@ -20,41 +20,41 @@ static __device__ void csrmmnn_general_device(rocsparse_int M, rocsparse_int ldc, rocsparse_index_base idx_base) { - rocsparse_int tid = hipThreadIdx_x; - rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; - rocsparse_int warpid = gid / SUBWAVE_SIZE; - rocsparse_int laneid = gid & (SUBWAVE_SIZE - 1); - rocsparse_int subid = tid / SUBWAVE_SIZE; - rocsparse_int nwarps = hipGridDim_x * hipBlockDim_x / SUBWAVE_SIZE; - rocsparse_int col = laneid + hipBlockIdx_y * SUBWAVE_SIZE; - rocsparse_int colB = col * ldb; - rocsparse_int colC = col * ldc; - - __shared__ rocsparse_int shared_col[BLOCKSIZE / SUBWAVE_SIZE][SUBWAVE_SIZE]; - __shared__ T shared_val[BLOCKSIZE / SUBWAVE_SIZE][SUBWAVE_SIZE]; - - for(rocsparse_int row = warpid; row < M; row += nwarps) + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; + rocsparse_int lid = gid & (WF_SIZE - 1); + rocsparse_int wid = tid / WF_SIZE; + rocsparse_int nwf = hipGridDim_x * hipBlockDim_x / WF_SIZE; + 
rocsparse_int col = lid + hipBlockIdx_y * WF_SIZE; + + rocsparse_int colB = col * ldb; + rocsparse_int colC = col * ldc; + + __shared__ rocsparse_int shared_col[BLOCKSIZE / WF_SIZE][WF_SIZE]; + __shared__ T shared_val[BLOCKSIZE / WF_SIZE][WF_SIZE]; + + for(rocsparse_int row = gid / WF_SIZE; row < M; row += nwf) { rocsparse_int row_start = __ldg(csr_row_ptr + row) - idx_base; rocsparse_int row_end = __ldg(csr_row_ptr + row + 1) - idx_base; T sum = static_cast(0); - for(rocsparse_int j = row_start; j < row_end; j += SUBWAVE_SIZE) + for(rocsparse_int j = row_start; j < row_end; j += WF_SIZE) { - rocsparse_int k = j + laneid; + rocsparse_int k = j + lid; __syncthreads(); - shared_col[subid][laneid] = (k < row_end) ? __ldg(csr_col_ind + k) - idx_base : 0; - shared_val[subid][laneid] = + shared_col[wid][lid] = (k < row_end) ? __ldg(csr_col_ind + k) - idx_base : 0; + shared_val[wid][lid] = (k < row_end) ? alpha * __ldg(csr_val + k) : static_cast(0); __syncthreads(); - for(rocsparse_int i = 0; i < SUBWAVE_SIZE && col < N; ++i) + for(rocsparse_int i = 0; i < WF_SIZE && col < N; ++i) { - sum += shared_val[subid][i] * __ldg(&B[shared_col[subid][i] + colB]); + sum += shared_val[wid][i] * __ldg(&B[shared_col[wid][i] + colB]); } } @@ -72,7 +72,7 @@ static __device__ void csrmmnn_general_device(rocsparse_int M, } } -template +template static __device__ void csrmmnt_general_device(rocsparse_int offset, rocsparse_int ncol, rocsparse_int M, @@ -90,44 +90,44 @@ static __device__ void csrmmnt_general_device(rocsparse_int offset, rocsparse_int ldc, rocsparse_index_base idx_base) { - rocsparse_int tid = hipThreadIdx_x; - rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; - rocsparse_int row = gid / SUBWAVE_SIZE; - rocsparse_int laneid = tid & (SUBWAVE_SIZE - 1); - rocsparse_int subid = hipThreadIdx_x / SUBWAVE_SIZE; + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; + rocsparse_int row = gid / WF_SIZE; + rocsparse_int lid = tid & 
(WF_SIZE - 1); + rocsparse_int wid = tid / WF_SIZE; if(row >= M) { return; } - __shared__ rocsparse_int shared_col[BLOCKSIZE / SUBWAVE_SIZE][SUBWAVE_SIZE]; - __shared__ T shared_val[BLOCKSIZE / SUBWAVE_SIZE][SUBWAVE_SIZE]; + __shared__ rocsparse_int shared_col[BLOCKSIZE / WF_SIZE][WF_SIZE]; + __shared__ T shared_val[BLOCKSIZE / WF_SIZE][WF_SIZE]; rocsparse_int row_start = __ldg(csr_row_ptr + row) - idx_base; rocsparse_int row_end = __ldg(csr_row_ptr + row + 1) - idx_base; - for(rocsparse_int l = offset; l < ncol; l += SUBWAVE_SIZE) + for(rocsparse_int l = offset; l < ncol; l += WF_SIZE) { - rocsparse_int col = l + laneid; + rocsparse_int col = l + lid; T sum = static_cast(0); - for(rocsparse_int j = row_start; j < row_end; j += SUBWAVE_SIZE) + for(rocsparse_int j = row_start; j < row_end; j += WF_SIZE) { - rocsparse_int k = j + laneid; + rocsparse_int k = j + lid; __syncthreads(); - shared_col[subid][laneid] = (k < row_end) ? N * (__ldg(csr_col_ind + k) - idx_base) : 0; - shared_val[subid][laneid] = + shared_col[wid][lid] = (k < row_end) ? N * (__ldg(csr_col_ind + k) - idx_base) : 0; + shared_val[wid][lid] = (k < row_end) ? alpha * __ldg(csr_val + k) : static_cast(0); __syncthreads(); - for(rocsparse_int i = 0; i < SUBWAVE_SIZE; ++i) + for(rocsparse_int i = 0; i < WF_SIZE; ++i) { - T val_B = (col < ncol) ? __ldg(B + col + shared_col[subid][i]) : static_cast(0); - sum += shared_val[subid][i] * val_B; + T val_B = (col < ncol) ? 
__ldg(B + col + shared_col[wid][i]) : static_cast(0); + sum += shared_val[wid][i] * val_B; } } diff --git a/library/src/level3/rocsparse_csrmm.hpp b/library/src/level3/rocsparse_csrmm.hpp index 627113c4..7f5145f8 100644 --- a/library/src/level3/rocsparse_csrmm.hpp +++ b/library/src/level3/rocsparse_csrmm.hpp @@ -13,7 +13,7 @@ #include -template +template __launch_bounds__(256) __global__ void csrmmnn_kernel_host_pointer(rocsparse_int m, rocsparse_int n, @@ -30,11 +30,11 @@ __launch_bounds__(256) __global__ rocsparse_int ldc, rocsparse_index_base idx_base) { - csrmmnn_general_device( + csrmmnn_general_device( m, n, k, nnz, alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, beta, C, ldc, idx_base); } -template +template __launch_bounds__(256) __global__ void csrmmnn_kernel_device_pointer(rocsparse_int m, rocsparse_int n, @@ -56,11 +56,11 @@ __launch_bounds__(256) __global__ return; } - csrmmnn_general_device( + csrmmnn_general_device( m, n, k, nnz, *alpha, csr_row_ptr, csr_col_ind, csr_val, B, ldb, *beta, C, ldc, idx_base); } -template +template __launch_bounds__(256) __global__ void csrmmnt_kernel_host_pointer(rocsparse_int offset, rocsparse_int ncol, @@ -79,7 +79,7 @@ __launch_bounds__(256) __global__ rocsparse_int ldc, rocsparse_index_base idx_base) { - csrmmnt_general_device(offset, + csrmmnt_general_device(offset, ncol, m, n, @@ -97,7 +97,7 @@ __launch_bounds__(256) __global__ idx_base); } -template +template __launch_bounds__(256) __global__ void csrmmnt_kernel_device_pointer(rocsparse_int offset, rocsparse_int ncol, @@ -121,7 +121,7 @@ __launch_bounds__(256) __global__ return; } - csrmmnt_general_device(offset, + csrmmnt_general_device(offset, ncol, m, n, @@ -465,7 +465,7 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, descr->base); } } - else if(avg_row_nnz < 64 || handle->warp_size == 32) + else if(avg_row_nnz < 64 || handle->wavefront_size == 32) { remainder = n % 32; main = n - remainder; @@ -496,7 +496,7 @@ rocsparse_status 
rocsparse_csrmm_template(rocsparse_handle handle, descr->base); } } - else if(handle->warp_size == 64) + else if(handle->wavefront_size == 64) { remainder = n % 64; main = n - remainder; @@ -583,7 +583,7 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, ldc, descr->base); } - else if(remainder <= 32 || handle->warp_size == 32) + else if(remainder <= 32 || handle->wavefront_size == 32) { hipLaunchKernelGGL((csrmmnt_kernel_device_pointer), dim3((32 * m - 1) / CSRMMNT_DIM + 1), @@ -711,7 +711,7 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, descr->base); } } - else if(avg_row_nnz < 64 || handle->warp_size == 32) + else if(avg_row_nnz < 64 || handle->wavefront_size == 32) { remainder = n % 32; main = n - remainder; @@ -742,7 +742,7 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, descr->base); } } - else if(handle->warp_size == 64) + else if(handle->wavefront_size == 64) { remainder = n % 64; main = n - remainder; @@ -829,7 +829,7 @@ rocsparse_status rocsparse_csrmm_template(rocsparse_handle handle, ldc, descr->base); } - else if(remainder <= 32 || handle->warp_size == 32) + else if(remainder <= 32 || handle->wavefront_size == 32) { hipLaunchKernelGGL((csrmmnt_kernel_host_pointer), dim3((32 * m - 1) / CSRMMNT_DIM + 1), From 68c3272f678f8c184ccca13dfee2b6a5f7915219 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 29 Aug 2018 07:58:40 +0200 Subject: [PATCH 226/304] removed emails from files --- docker/dockerfile-build-ubuntu | 2 +- docker/dockerfile-install-ubuntu | 2 +- library/CMakeLists.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/dockerfile-build-ubuntu b/docker/dockerfile-build-ubuntu index db00a038..f7042a2f 100644 --- a/docker/dockerfile-build-ubuntu +++ b/docker/dockerfile-build-ubuntu @@ -2,7 +2,7 @@ ARG base_image FROM ${base_image} -MAINTAINER Nico Trost +MAINTAINER Nico Trost ARG user_uid diff --git a/docker/dockerfile-install-ubuntu 
b/docker/dockerfile-install-ubuntu index 56ee71fb..bf490a5e 100644 --- a/docker/dockerfile-install-ubuntu +++ b/docker/dockerfile-install-ubuntu @@ -2,7 +2,7 @@ ARG base_image FROM ${base_image} -MAINTAINER Nico Trost +MAINTAINER Nico Trost # Copy the debian package of rocsparse into the container from host COPY *.deb /tmp/ diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index 48d3f916..0269b361 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -108,7 +108,7 @@ set(ROCSPARSE_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LIB rocm_create_package( NAME ${package_name} DESCRIPTION "Radeon Open Compute SPARSE library" - MAINTAINER "Nico Trost " + MAINTAINER "Nico Trost" LDCONFIG LDCONFIG_DIR ${ROCSPARSE_CONFIG_DIR} ) From 58c7a310eb06631c4fdaab3aff42c12d491342a8 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 29 Aug 2018 08:06:45 +0200 Subject: [PATCH 227/304] added MIT license text to external files from crascit project --- cmake/DownloadProject.CMakeLists.cmake.in | 22 ++++++++++++++++++++++ cmake/DownloadProject.cmake | 22 ++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/cmake/DownloadProject.CMakeLists.cmake.in b/cmake/DownloadProject.CMakeLists.cmake.in index 5546c03a..064dbf72 100644 --- a/cmake/DownloadProject.CMakeLists.cmake.in +++ b/cmake/DownloadProject.CMakeLists.cmake.in @@ -1,5 +1,27 @@ # Distributed under the OSI-approved MIT License. See accompanying # file LICENSE or https://github.com/Crascit/DownloadProject for details. 
+# +# The MIT License (MIT) +# +# Copyright (c) 2015 Crascit +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. cmake_minimum_required(VERSION 2.8.2) diff --git a/cmake/DownloadProject.cmake b/cmake/DownloadProject.cmake index 54633d34..6b3c5277 100644 --- a/cmake/DownloadProject.cmake +++ b/cmake/DownloadProject.cmake @@ -1,6 +1,28 @@ # Distributed under the OSI-approved MIT License. See accompanying # file LICENSE or https://github.com/Crascit/DownloadProject for details. 
# +# The MIT License (MIT) +# +# Copyright (c) 2015 Crascit +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# # MODULE: DownloadProject # # PROVIDES: From 33bb6bfd28be2ef40b54991eb988b7b5af6c4ab6 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 29 Aug 2018 08:13:22 +0200 Subject: [PATCH 228/304] license text adjusted for all files --- library/include/rocsparse-types.h | 1 + library/include/rocsparse-version.h.in | 1 + library/include/rocsparse.h | 1 + library/src/CMakeLists.txt | 1 + library/src/conversion/coo2csr_device.h | 1 + library/src/conversion/coosort_device.h | 1 + library/src/conversion/csr2coo_device.h | 1 + library/src/conversion/csr2csc_device.h | 1 + library/src/conversion/csr2ell_device.h | 1 + library/src/conversion/csr2hyb_device.h | 1 + library/src/conversion/csrsort_device.h | 1 + library/src/conversion/ell2csr_device.h | 1 + library/src/conversion/identity_device.h | 1 + library/src/conversion/rocsparse_coo2csr.cpp | 1 + library/src/conversion/rocsparse_coosort.cpp | 1 + library/src/conversion/rocsparse_csr2coo.cpp | 1 + library/src/conversion/rocsparse_csr2csc.cpp | 1 + library/src/conversion/rocsparse_csr2csc.hpp | 1 + library/src/conversion/rocsparse_csr2ell.cpp | 1 + library/src/conversion/rocsparse_csr2ell.hpp | 1 + library/src/conversion/rocsparse_csr2hyb.cpp | 1 + library/src/conversion/rocsparse_csr2hyb.hpp | 1 + library/src/conversion/rocsparse_csrsort.cpp | 1 + library/src/conversion/rocsparse_ell2csr.cpp | 1 + library/src/conversion/rocsparse_ell2csr.hpp | 1 + library/src/conversion/rocsparse_identity.cpp | 1 + library/src/handle.cpp | 1 + library/src/include/handle.h | 1 + library/src/include/logging.h | 1 + library/src/include/utility.h | 1 + library/src/level1/axpyi_device.h | 1 + library/src/level1/dotci_device.h | 4 ++++ library/src/level1/doti_device.h | 1 + library/src/level1/gthr_device.h | 1 + library/src/level1/gthrz_device.h | 1 + library/src/level1/rocsparse_axpyi.cpp | 1 + library/src/level1/rocsparse_axpyi.hpp | 1 + library/src/level1/rocsparse_dotci.cpp | 1 + library/src/level1/rocsparse_dotci.hpp | 1 + 
library/src/level1/rocsparse_doti.cpp | 1 + library/src/level1/rocsparse_doti.hpp | 1 + library/src/level1/rocsparse_gthr.cpp | 1 + library/src/level1/rocsparse_gthr.hpp | 1 + library/src/level1/rocsparse_gthrz.cpp | 1 + library/src/level1/rocsparse_gthrz.hpp | 1 + library/src/level1/rocsparse_roti.cpp | 1 + library/src/level1/rocsparse_roti.hpp | 1 + library/src/level1/rocsparse_sctr.cpp | 1 + library/src/level1/rocsparse_sctr.hpp | 1 + library/src/level1/roti_device.h | 1 + library/src/level1/sctr_device.h | 1 + library/src/level2/coomv_device.h | 1 + library/src/level2/csrmv_device.h | 5 +++++ library/src/level2/ellmv_device.h | 5 +++++ library/src/level2/rocsparse_coomv.cpp | 1 + library/src/level2/rocsparse_coomv.hpp | 1 + library/src/level2/rocsparse_csrmv.cpp | 1 + library/src/level2/rocsparse_csrmv.hpp | 1 + library/src/level2/rocsparse_ellmv.cpp | 1 + library/src/level2/rocsparse_ellmv.hpp | 1 + library/src/level2/rocsparse_hybmv.cpp | 1 + library/src/level2/rocsparse_hybmv.hpp | 1 + library/src/level3/csrmm_device.h | 5 +++++ library/src/level3/rocsparse_csrmm.cpp | 1 + library/src/level3/rocsparse_csrmm.hpp | 1 + library/src/rocsparse_auxiliary.cpp | 1 + 66 files changed, 81 insertions(+) diff --git a/library/include/rocsparse-types.h b/library/include/rocsparse-types.h index b3e272cb..0df18e8d 100644 --- a/library/include/rocsparse-types.h +++ b/library/include/rocsparse-types.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ /*! \file diff --git a/library/include/rocsparse-version.h.in b/library/include/rocsparse-version.h.in index f6e424e4..f396681c 100644 --- a/library/include/rocsparse-version.h.in +++ b/library/include/rocsparse-version.h.in @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ /*!\file diff --git a/library/include/rocsparse.h b/library/include/rocsparse.h index ea703562..c6f6757a 100644 --- a/library/include/rocsparse.h +++ b/library/include/rocsparse.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ /*!\file diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 4d235e16..215585b8 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -1,5 +1,6 @@ # ######################################################################## # Copyright 2018 Advanced Micro Devices, Inc. +# # ######################################################################## # rocSPARSE source diff --git a/library/src/conversion/coo2csr_device.h b/library/src/conversion/coo2csr_device.h index 8f2afddf..878427a6 100644 --- a/library/src/conversion/coo2csr_device.h +++ b/library/src/conversion/coo2csr_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/coosort_device.h b/library/src/conversion/coosort_device.h index ab8700f2..34f8ed7b 100644 --- a/library/src/conversion/coosort_device.h +++ b/library/src/conversion/coosort_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/csr2coo_device.h b/library/src/conversion/csr2coo_device.h index 5b7cdbf6..82b936d1 100644 --- a/library/src/conversion/csr2coo_device.h +++ b/library/src/conversion/csr2coo_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/csr2csc_device.h b/library/src/conversion/csr2csc_device.h index 209480ca..e17c90d0 100644 --- a/library/src/conversion/csr2csc_device.h +++ b/library/src/conversion/csr2csc_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/csr2ell_device.h b/library/src/conversion/csr2ell_device.h index 98219238..b42590b6 100644 --- a/library/src/conversion/csr2ell_device.h +++ b/library/src/conversion/csr2ell_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/csr2hyb_device.h b/library/src/conversion/csr2hyb_device.h index c536ddbb..77bb5654 100644 --- a/library/src/conversion/csr2hyb_device.h +++ b/library/src/conversion/csr2hyb_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/csrsort_device.h b/library/src/conversion/csrsort_device.h index 815579f7..7f58805e 100644 --- a/library/src/conversion/csrsort_device.h +++ b/library/src/conversion/csrsort_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/ell2csr_device.h b/library/src/conversion/ell2csr_device.h index d50efbe4..a240cd57 100644 --- a/library/src/conversion/ell2csr_device.h +++ b/library/src/conversion/ell2csr_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/identity_device.h b/library/src/conversion/identity_device.h index f59930eb..323dbbbf 100644 --- a/library/src/conversion/identity_device.h +++ b/library/src/conversion/identity_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/rocsparse_coo2csr.cpp b/library/src/conversion/rocsparse_coo2csr.cpp index 1eb902d4..46df2cc9 100644 --- a/library/src/conversion/rocsparse_coo2csr.cpp +++ b/library/src/conversion/rocsparse_coo2csr.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/conversion/rocsparse_coosort.cpp b/library/src/conversion/rocsparse_coosort.cpp index 94c5d4fa..f2610465 100644 --- a/library/src/conversion/rocsparse_coosort.cpp +++ b/library/src/conversion/rocsparse_coosort.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/conversion/rocsparse_csr2coo.cpp b/library/src/conversion/rocsparse_csr2coo.cpp index 7baaafe8..3b4df2cf 100644 --- a/library/src/conversion/rocsparse_csr2coo.cpp +++ b/library/src/conversion/rocsparse_csr2coo.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/conversion/rocsparse_csr2csc.cpp b/library/src/conversion/rocsparse_csr2csc.cpp index a7b29755..ece9d7fd 100644 --- a/library/src/conversion/rocsparse_csr2csc.cpp +++ b/library/src/conversion/rocsparse_csr2csc.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/conversion/rocsparse_csr2csc.hpp b/library/src/conversion/rocsparse_csr2csc.hpp index e34187df..b2eb4ad5 100644 --- a/library/src/conversion/rocsparse_csr2csc.hpp +++ b/library/src/conversion/rocsparse_csr2csc.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/rocsparse_csr2ell.cpp b/library/src/conversion/rocsparse_csr2ell.cpp index 1505dcc8..40efe439 100644 --- a/library/src/conversion/rocsparse_csr2ell.cpp +++ b/library/src/conversion/rocsparse_csr2ell.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/conversion/rocsparse_csr2ell.hpp b/library/src/conversion/rocsparse_csr2ell.hpp index 3e19d308..59c9d395 100644 --- a/library/src/conversion/rocsparse_csr2ell.hpp +++ b/library/src/conversion/rocsparse_csr2ell.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/rocsparse_csr2hyb.cpp b/library/src/conversion/rocsparse_csr2hyb.cpp index 891a321c..926e8b33 100644 --- a/library/src/conversion/rocsparse_csr2hyb.cpp +++ b/library/src/conversion/rocsparse_csr2hyb.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/conversion/rocsparse_csr2hyb.hpp b/library/src/conversion/rocsparse_csr2hyb.hpp index 83fd91b3..f290c54c 100644 --- a/library/src/conversion/rocsparse_csr2hyb.hpp +++ b/library/src/conversion/rocsparse_csr2hyb.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index 5e8642bb..7c5e2f14 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/conversion/rocsparse_ell2csr.cpp b/library/src/conversion/rocsparse_ell2csr.cpp index 17bca740..33c8b28d 100644 --- a/library/src/conversion/rocsparse_ell2csr.cpp +++ b/library/src/conversion/rocsparse_ell2csr.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "definitions.h" diff --git a/library/src/conversion/rocsparse_ell2csr.hpp b/library/src/conversion/rocsparse_ell2csr.hpp index b5ba34ec..f6d4439e 100644 --- a/library/src/conversion/rocsparse_ell2csr.hpp +++ b/library/src/conversion/rocsparse_ell2csr.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/conversion/rocsparse_identity.cpp b/library/src/conversion/rocsparse_identity.cpp index bba49c98..fb5e85d3 100644 --- a/library/src/conversion/rocsparse_identity.cpp +++ b/library/src/conversion/rocsparse_identity.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/handle.cpp b/library/src/handle.cpp index a6081f7d..547416ba 100644 --- a/library/src/handle.cpp +++ b/library/src/handle.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "definitions.h" diff --git a/library/src/include/handle.h b/library/src/include/handle.h index 71f32758..f0bb4716 100644 --- a/library/src/include/handle.h +++ b/library/src/include/handle.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/include/logging.h b/library/src/include/logging.h index 543cd308..0b4376ed 100644 --- a/library/src/include/logging.h +++ b/library/src/include/logging.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/include/utility.h b/library/src/include/utility.h index fe4aa6db..9843a24e 100644 --- a/library/src/include/utility.h +++ b/library/src/include/utility.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/axpyi_device.h b/library/src/level1/axpyi_device.h index 28c5b2e9..f04d32e9 100644 --- a/library/src/level1/axpyi_device.h +++ b/library/src/level1/axpyi_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/dotci_device.h b/library/src/level1/dotci_device.h index e69de29b..4ee49bd2 100644 --- a/library/src/level1/dotci_device.h +++ b/library/src/level1/dotci_device.h @@ -0,0 +1,4 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ diff --git a/library/src/level1/doti_device.h b/library/src/level1/doti_device.h index 625d22b4..e4e7916f 100644 --- a/library/src/level1/doti_device.h +++ b/library/src/level1/doti_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/gthr_device.h b/library/src/level1/gthr_device.h index 0a52226c..95d86cf3 100644 --- a/library/src/level1/gthr_device.h +++ b/library/src/level1/gthr_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/gthrz_device.h b/library/src/level1/gthrz_device.h index 458941e8..c0b4a0ac 100644 --- a/library/src/level1/gthrz_device.h +++ b/library/src/level1/gthrz_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/rocsparse_axpyi.cpp b/library/src/level1/rocsparse_axpyi.cpp index 0007a89d..c39b4a9b 100644 --- a/library/src/level1/rocsparse_axpyi.cpp +++ b/library/src/level1/rocsparse_axpyi.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/level1/rocsparse_axpyi.hpp b/library/src/level1/rocsparse_axpyi.hpp index f620fe1b..88befbce 100644 --- a/library/src/level1/rocsparse_axpyi.hpp +++ b/library/src/level1/rocsparse_axpyi.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/rocsparse_dotci.cpp b/library/src/level1/rocsparse_dotci.cpp index f99b3c30..405ba322 100644 --- a/library/src/level1/rocsparse_dotci.cpp +++ b/library/src/level1/rocsparse_dotci.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/level1/rocsparse_dotci.hpp b/library/src/level1/rocsparse_dotci.hpp index 8d642e7b..739d1b14 100644 --- a/library/src/level1/rocsparse_dotci.hpp +++ b/library/src/level1/rocsparse_dotci.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/rocsparse_doti.cpp b/library/src/level1/rocsparse_doti.cpp index d09b5401..a278abdc 100644 --- a/library/src/level1/rocsparse_doti.cpp +++ b/library/src/level1/rocsparse_doti.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/level1/rocsparse_doti.hpp b/library/src/level1/rocsparse_doti.hpp index a96ac195..d3b9e781 100644 --- a/library/src/level1/rocsparse_doti.hpp +++ b/library/src/level1/rocsparse_doti.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/rocsparse_gthr.cpp b/library/src/level1/rocsparse_gthr.cpp index e2ac4fd3..3bc3bd49 100644 --- a/library/src/level1/rocsparse_gthr.cpp +++ b/library/src/level1/rocsparse_gthr.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/level1/rocsparse_gthr.hpp b/library/src/level1/rocsparse_gthr.hpp index 3a87c7b3..a3cb87e0 100644 --- a/library/src/level1/rocsparse_gthr.hpp +++ b/library/src/level1/rocsparse_gthr.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/rocsparse_gthrz.cpp b/library/src/level1/rocsparse_gthrz.cpp index 283df86e..ba686537 100644 --- a/library/src/level1/rocsparse_gthrz.cpp +++ b/library/src/level1/rocsparse_gthrz.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/level1/rocsparse_gthrz.hpp b/library/src/level1/rocsparse_gthrz.hpp index 3898db08..5115fd89 100644 --- a/library/src/level1/rocsparse_gthrz.hpp +++ b/library/src/level1/rocsparse_gthrz.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/rocsparse_roti.cpp b/library/src/level1/rocsparse_roti.cpp index 05c29d9d..f47e272d 100644 --- a/library/src/level1/rocsparse_roti.cpp +++ b/library/src/level1/rocsparse_roti.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/level1/rocsparse_roti.hpp b/library/src/level1/rocsparse_roti.hpp index 815318a1..6fc69106 100644 --- a/library/src/level1/rocsparse_roti.hpp +++ b/library/src/level1/rocsparse_roti.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/rocsparse_sctr.cpp b/library/src/level1/rocsparse_sctr.cpp index 96026ff8..7311b158 100644 --- a/library/src/level1/rocsparse_sctr.cpp +++ b/library/src/level1/rocsparse_sctr.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/level1/rocsparse_sctr.hpp b/library/src/level1/rocsparse_sctr.hpp index 07fad1be..b9078d55 100644 --- a/library/src/level1/rocsparse_sctr.hpp +++ b/library/src/level1/rocsparse_sctr.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/roti_device.h b/library/src/level1/roti_device.h index 4150bb74..07135c83 100644 --- a/library/src/level1/roti_device.h +++ b/library/src/level1/roti_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #pragma once diff --git a/library/src/level1/sctr_device.h b/library/src/level1/sctr_device.h index 1fcc4304..b536201a 100644 --- a/library/src/level1/sctr_device.h +++ b/library/src/level1/sctr_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level2/coomv_device.h b/library/src/level2/coomv_device.h index 4d1003ff..70887311 100644 --- a/library/src/level2/coomv_device.h +++ b/library/src/level2/coomv_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index 03838101..c788c8f3 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -1,3 +1,8 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + #pragma once #ifndef CSRMV_DEVICE_H #define CSRMV_DEVICE_H diff --git a/library/src/level2/ellmv_device.h b/library/src/level2/ellmv_device.h index ce61210a..e4e79699 100644 --- a/library/src/level2/ellmv_device.h +++ b/library/src/level2/ellmv_device.h @@ -1,3 +1,8 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * + * ************************************************************************ */ + #pragma once #ifndef ELLMV_DEVICE_H #define ELLMV_DEVICE_H diff --git a/library/src/level2/rocsparse_coomv.cpp b/library/src/level2/rocsparse_coomv.cpp index 041a9368..155fd65e 100644 --- a/library/src/level2/rocsparse_coomv.cpp +++ b/library/src/level2/rocsparse_coomv.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/level2/rocsparse_coomv.hpp b/library/src/level2/rocsparse_coomv.hpp index 078ffd73..2261716a 100644 --- a/library/src/level2/rocsparse_coomv.hpp +++ b/library/src/level2/rocsparse_coomv.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index e83311df..3d2e3ae8 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "definitions.h" diff --git a/library/src/level2/rocsparse_csrmv.hpp b/library/src/level2/rocsparse_csrmv.hpp index bacd214c..d397fa14 100644 --- a/library/src/level2/rocsparse_csrmv.hpp +++ b/library/src/level2/rocsparse_csrmv.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #pragma once diff --git a/library/src/level2/rocsparse_ellmv.cpp b/library/src/level2/rocsparse_ellmv.cpp index 703104f5..dea9c13e 100644 --- a/library/src/level2/rocsparse_ellmv.cpp +++ b/library/src/level2/rocsparse_ellmv.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/level2/rocsparse_ellmv.hpp b/library/src/level2/rocsparse_ellmv.hpp index ae897766..19cb193d 100644 --- a/library/src/level2/rocsparse_ellmv.hpp +++ b/library/src/level2/rocsparse_ellmv.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/level2/rocsparse_hybmv.cpp b/library/src/level2/rocsparse_hybmv.cpp index 933748eb..25e2cf35 100644 --- a/library/src/level2/rocsparse_hybmv.cpp +++ b/library/src/level2/rocsparse_hybmv.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/level2/rocsparse_hybmv.hpp b/library/src/level2/rocsparse_hybmv.hpp index 092b0b3d..5d79eff7 100644 --- a/library/src/level2/rocsparse_hybmv.hpp +++ b/library/src/level2/rocsparse_hybmv.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #pragma once diff --git a/library/src/level3/csrmm_device.h b/library/src/level3/csrmm_device.h index 1c18feda..504bf6f3 100644 --- a/library/src/level3/csrmm_device.h +++ b/library/src/level3/csrmm_device.h @@ -1,3 +1,8 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + #pragma once #ifndef CSRMM_DEVICE_H #define CSRMM_DEVICE_H diff --git a/library/src/level3/rocsparse_csrmm.cpp b/library/src/level3/rocsparse_csrmm.cpp index 2a903983..376cbbb6 100644 --- a/library/src/level3/rocsparse_csrmm.cpp +++ b/library/src/level3/rocsparse_csrmm.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "rocsparse.h" diff --git a/library/src/level3/rocsparse_csrmm.hpp b/library/src/level3/rocsparse_csrmm.hpp index 7f5145f8..3f66e1c3 100644 --- a/library/src/level3/rocsparse_csrmm.hpp +++ b/library/src/level3/rocsparse_csrmm.hpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index fb3490fe..d4de1825 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #include "definitions.h" From eca48d12031e65b7f5afcfab770e9b19e4d44397 Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 29 Aug 2018 20:53:10 +0200 Subject: [PATCH 229/304] license file for downloadproject added --- cmake/Dependencies.cmake | 2 +- .../DownloadProject.CMakeLists.cmake.in | 0 .../DownloadProject.cmake | 0 cmake/DownloadProject/LICENSE | 22 +++++++++++++++++++ 4 files changed, 23 insertions(+), 1 deletion(-) rename cmake/{ => DownloadProject}/DownloadProject.CMakeLists.cmake.in (100%) rename cmake/{ => DownloadProject}/DownloadProject.cmake (100%) create mode 100644 cmake/DownloadProject/LICENSE diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index cb46a8e7..58dcb798 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -8,7 +8,7 @@ find_package(Git REQUIRED) # DownloadProject package -include(cmake/DownloadProject.cmake) +include(cmake/DownloadProject/DownloadProject.cmake) # HIP configuration if(HIP_PLATFORM STREQUAL "hcc") diff --git a/cmake/DownloadProject.CMakeLists.cmake.in b/cmake/DownloadProject/DownloadProject.CMakeLists.cmake.in similarity index 100% rename from cmake/DownloadProject.CMakeLists.cmake.in rename to cmake/DownloadProject/DownloadProject.CMakeLists.cmake.in diff --git a/cmake/DownloadProject.cmake b/cmake/DownloadProject/DownloadProject.cmake similarity index 100% rename from cmake/DownloadProject.cmake rename to cmake/DownloadProject/DownloadProject.cmake diff --git a/cmake/DownloadProject/LICENSE b/cmake/DownloadProject/LICENSE new file mode 100644 index 00000000..66a28262 --- /dev/null +++ b/cmake/DownloadProject/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2015 Crascit + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without 
limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + From 9e09e33e315d5fb5efb8036a5463c61e76d8713a Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 29 Aug 2018 22:26:55 +0200 Subject: [PATCH 230/304] downloadproject one dir up --- cmake/Dependencies.cmake | 2 +- cmake/{DownloadProject => }/DownloadProject.CMakeLists.cmake.in | 0 cmake/{DownloadProject => }/DownloadProject.cmake | 0 cmake/{DownloadProject => }/LICENSE | 0 4 files changed, 1 insertion(+), 1 deletion(-) rename cmake/{DownloadProject => }/DownloadProject.CMakeLists.cmake.in (100%) rename cmake/{DownloadProject => }/DownloadProject.cmake (100%) rename cmake/{DownloadProject => }/LICENSE (100%) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 58dcb798..cb46a8e7 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -8,7 +8,7 @@ find_package(Git REQUIRED) # DownloadProject package -include(cmake/DownloadProject/DownloadProject.cmake) +include(cmake/DownloadProject.cmake) # HIP configuration if(HIP_PLATFORM STREQUAL "hcc") diff --git a/cmake/DownloadProject/DownloadProject.CMakeLists.cmake.in b/cmake/DownloadProject.CMakeLists.cmake.in similarity index 100% rename from 
cmake/DownloadProject/DownloadProject.CMakeLists.cmake.in rename to cmake/DownloadProject.CMakeLists.cmake.in diff --git a/cmake/DownloadProject/DownloadProject.cmake b/cmake/DownloadProject.cmake similarity index 100% rename from cmake/DownloadProject/DownloadProject.cmake rename to cmake/DownloadProject.cmake diff --git a/cmake/DownloadProject/LICENSE b/cmake/LICENSE similarity index 100% rename from cmake/DownloadProject/LICENSE rename to cmake/LICENSE From 935c930bc893e06268eab1f5570c8f315daa0f7b Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 30 Aug 2018 06:56:33 +0000 Subject: [PATCH 231/304] workaround to fix issues with old docker images and rocprim (until we run 1.8.2 based tests) --- cmake/Dependencies.cmake | 4 ++-- .../{ => DownloadProject}/DownloadProject.CMakeLists.cmake.in | 0 cmake/{ => DownloadProject}/DownloadProject.cmake | 0 cmake/{ => DownloadProject}/LICENSE | 0 4 files changed, 2 insertions(+), 2 deletions(-) rename cmake/{ => DownloadProject}/DownloadProject.CMakeLists.cmake.in (100%) rename cmake/{ => DownloadProject}/DownloadProject.cmake (100%) rename cmake/{ => DownloadProject}/LICENSE (100%) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index cb46a8e7..6c99037a 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -8,7 +8,7 @@ find_package(Git REQUIRED) # DownloadProject package -include(cmake/DownloadProject.cmake) +include(cmake/DownloadProject/DownloadProject.cmake) # HIP configuration if(HIP_PLATFORM STREQUAL "hcc") @@ -47,7 +47,7 @@ if(HIP_PLATFORM STREQUAL "hcc") message(STATUS "Downloading rocPRIM.") download_project(PROJ rocPRIM GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git - GIT_TAG master + GIT_TAG 0702a4a797f021bf7098d6b8cca046c85f4cb22d # TODO change back to master once rocm docker image is updated INSTALL_DIR ${ROCPRIM_ROOT} CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hcc LOG_DOWNLOAD TRUE 
diff --git a/cmake/DownloadProject.CMakeLists.cmake.in b/cmake/DownloadProject/DownloadProject.CMakeLists.cmake.in similarity index 100% rename from cmake/DownloadProject.CMakeLists.cmake.in rename to cmake/DownloadProject/DownloadProject.CMakeLists.cmake.in diff --git a/cmake/DownloadProject.cmake b/cmake/DownloadProject/DownloadProject.cmake similarity index 100% rename from cmake/DownloadProject.cmake rename to cmake/DownloadProject/DownloadProject.cmake diff --git a/cmake/LICENSE b/cmake/DownloadProject/LICENSE similarity index 100% rename from cmake/LICENSE rename to cmake/DownloadProject/LICENSE From b60e50743d999f4aa1f9f56b2235559463d8dd0a Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 30 Aug 2018 07:01:37 +0000 Subject: [PATCH 232/304] actually took the wrong version of rocPRIM - should work now --- cmake/Dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 6c99037a..b6538fcc 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -47,7 +47,7 @@ if(HIP_PLATFORM STREQUAL "hcc") message(STATUS "Downloading rocPRIM.") download_project(PROJ rocPRIM GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git - GIT_TAG 0702a4a797f021bf7098d6b8cca046c85f4cb22d # TODO change back to master once rocm docker image is updated + GIT_TAG caef132d64b29a7d857eb68af5323fc302d26766 # TODO change back to master once rocm docker image is updated INSTALL_DIR ${ROCPRIM_ROOT} CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hcc LOG_DOWNLOAD TRUE From ab15a67c62c14aa623f8069c999e01b7dc2feace Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Fri, 31 Aug 2018 11:35:39 +0200 Subject: [PATCH 233/304] added -d parameter for benchmark to select device --- clients/benchmarks/client.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clients/benchmarks/client.cpp 
b/clients/benchmarks/client.cpp index cb06faae..829e02cf 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -102,7 +102,7 @@ int main(int argc, char* argv[]) po::value(&argus.iters)->default_value(10), "Iterations to run inside timing loop") - ("device", + ("device,d", po::value(&device_id)->default_value(0), "Set default device to be used for subsequent program runs"); // clang-format on From 57648bee9e350ad2229c8ff55a550144b71cfccf Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 5 Sep 2018 12:25:58 +0200 Subject: [PATCH 234/304] added csrtr_info struct, fill mode mechanism, diag type mechanism and solve/analysis policies --- library/include/rocsparse-auxiliary.h | 23 ++++++ library/include/rocsparse-types.h | 23 ++++++ library/src/handle.cpp | 66 +++++++++++++++++ library/src/include/handle.h | 46 ++++++++++-- library/src/rocsparse_auxiliary.cpp | 102 +++++++++++++++++++++++++- 5 files changed, 254 insertions(+), 6 deletions(-) diff --git a/library/include/rocsparse-auxiliary.h b/library/include/rocsparse-auxiliary.h index ea19ceb3..a1513caa 100644 --- a/library/include/rocsparse-auxiliary.h +++ b/library/include/rocsparse-auxiliary.h @@ -111,6 +111,29 @@ rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descr, rocsparse_mat ROCSPARSE_EXPORT rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr); + + + + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_set_mat_fill_mode(rocsparse_mat_descr descr, rocsparse_fill_mode fill_mode); + +ROCSPARSE_EXPORT +rocsparse_fill_mode rocsparse_get_mat_fill_mode(const rocsparse_mat_descr descr); + + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_set_mat_diag_type(rocsparse_mat_descr descr, rocsparse_diag_type diag_type); + +ROCSPARSE_EXPORT +rocsparse_diag_type rocsparse_get_mat_diag_type(const rocsparse_mat_descr descr); + + + + + + + /******************************************************************************** * \brief rocsparse_hyb_mat is a structure holding 
the rocsparse HYB matrix. It * must be initialized using rocsparse_create_hyb_mat() and the returned handle diff --git a/library/include/rocsparse-types.h b/library/include/rocsparse-types.h index 0df18e8d..3b662855 100644 --- a/library/include/rocsparse-types.h +++ b/library/include/rocsparse-types.h @@ -55,6 +55,18 @@ typedef enum rocsparse_matrix_type_ { rocsparse_matrix_type_triangular = 3 /**< triangular matrix type. */ } rocsparse_matrix_type; +/*! \brief Used to indicate if the diagonal entries are unity. */ +typedef enum rocsparse_diag_type_ { + rocsparse_diag_type_non_unit = 0, /**< diagonal entries are non-unity. */ + rocsparse_diag_type_unit = 1 /**< diagonal entries are unity */ +} rocsparse_diag_type; + +/*! \brief Used to specify the matrix fill mode. */ +typedef enum rocsparse_fill_mode_ { + rocsparse_fill_mode_lower = 0, /**< lower triangular part is stored. */ + rocsparse_fill_mode_upper = 1 /**< upper triangular part is stored. */ +} rocsparse_fill_mode; + /*! \brief Used to specify where the operation is performed on. */ typedef enum rocsparse_action_ { rocsparse_action_symbolic = 0, /**< Operate only on indices. */ @@ -68,6 +80,17 @@ typedef enum rocsparse_hyb_partition_ { rocsparse_hyb_partition_max = 2 /**< max ELL nnz per row, no COO part. */ } rocsparse_hyb_partition; +/*! \brief Used to specify policy in triangular solvers and factorizations. */ +typedef enum rocsparse_solve_policy_ { + rocsparse_solve_policy_auto = 0 /**< automatically decide on level information. */ +} rocsparse_solve_policy; + +/*! \brief Used to specify policy in analysis functions. */ +typedef enum rocsparse_analysis_policy_ { + rocsparse_analysis_policy_reuse = 0, + rocsparse_analysis_policy_force = 1 +} rocsparse_analysis_policy; + /* ==================================================================================== */ /** * @brief rocsparse status codes definition. 
diff --git a/library/src/handle.cpp b/library/src/handle.cpp
index 547416ba..5810dd8d 100644
--- a/library/src/handle.cpp
+++ b/library/src/handle.cpp
@@ -140,3 +140,69 @@ rocsparse_status rocsparse_destroy_csrmv_info(rocsparse_csrmv_info info)
     }
     return rocsparse_status_success;
 }
+
+/********************************************************************************
+ * \brief rocsparse_create_csrtr_info allocates a _rocsparse_csrtr_info with new
+ * and stores the pointer in *info. A null info argument yields
+ * rocsparse_status_invalid_pointer.
+ * rocsparse_destroy_csrtr_info releases row_map and csr_diag_ind with hipFree()
+ * when they are set and then deletes the structure. A null info argument is
+ * accepted and reported as rocsparse_status_success.
+ * Both functions return rocsparse_status_success on the regular path.
+ * NOTE(review): RETURN_IF_HIP_ERROR presumably returns early on failure, in
+ * which case the structure itself would not be deleted - confirm.
+ * NOTE(review): the catch clauses only match a thrown rocsparse_status; a
+ * std::bad_alloc thrown by new would not be intercepted by them.
+ *******************************************************************************/
+rocsparse_status rocsparse_create_csrtr_info(rocsparse_csrtr_info* info)
+{
+    if(info == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+    else
+    {
+        // Allocate
+        try
+        {
+            *info = new _rocsparse_csrtr_info;
+        }
+        catch(const rocsparse_status& status)
+        {
+            return status;
+        }
+        return rocsparse_status_success;
+    }
+}
+
+rocsparse_status rocsparse_destroy_csrtr_info(rocsparse_csrtr_info info)
+{
+    if(info == nullptr)
+    {
+        return rocsparse_status_success;
+    }
+
+    // Clean up
+    if(info->row_map != nullptr)
+    {
+        RETURN_IF_HIP_ERROR(hipFree(info->row_map));
+        info->row_map = nullptr;
+    }
+
+    if(info->csr_diag_ind != nullptr)
+    {
+        RETURN_IF_HIP_ERROR(hipFree(info->csr_diag_ind));
+        info->csr_diag_ind = nullptr;
+    }
+
+    // Destruct
+    try
+    {
+        delete info;
+    }
+    catch(const rocsparse_status& status)
+    {
+        return status;
+    }
+    return rocsparse_status_success;
+}
diff --git a/library/src/include/handle.h b/library/src/include/handle.h
index f0bb4716..264a9c2f 100644
--- a/library/src/include/handle.h
+++ b/library/src/include/handle.h
@@ -11,10 +11,12 @@
 
 #include
 #include
+#include
 #include
 
 /*! \brief typedefs to opaque info structs */
 typedef struct _rocsparse_csrmv_info* rocsparse_csrmv_info;
+typedef struct _rocsparse_csrtr_info* rocsparse_csrtr_info;
 
 /********************************************************************************
  * \brief rocsparse_handle is a structure holding the rocsparse library context.
@@ -66,10 +68,10 @@ struct _rocsparse_mat_descr
 {
     // Matrix type
     rocsparse_matrix_type type = rocsparse_matrix_type_general;
-    // Fill mode TODO
-    // rocsparse_fill_mode fill;
+    // Fill mode
+    rocsparse_fill_mode fill_mode = rocsparse_fill_mode_lower;
     // Diagonal type
-    // rocsparse_diag_type diag;
+    rocsparse_diag_type diag_type = rocsparse_diag_type_non_unit;
     // Index base
     rocsparse_index_base base = rocsparse_index_base_zero;
 };
@@ -114,10 +116,13 @@ struct _rocsparse_hyb_mat
 struct _rocsparse_mat_info
 {
     // built flags
-    bool csrmv_built = false;
+    bool csrmv_built = false;
 
     // info structs
-    rocsparse_csrmv_info csrmv_info = nullptr;
+    rocsparse_csrmv_info csrmv_info = nullptr;
+    rocsparse_csrtr_info csrilu0_info = nullptr;
+    rocsparse_csrtr_info csrsv_upper_info = nullptr;
+    rocsparse_csrtr_info csrsv_lower_info = nullptr;
 };
 
 /********************************************************************************
@@ -156,6 +161,37 @@ rocsparse_status rocsparse_create_csrmv_info(rocsparse_csrmv_info* info);
  *******************************************************************************/
 rocsparse_status rocsparse_destroy_csrmv_info(rocsparse_csrmv_info info);
 
+/********************************************************************************
+ * \brief _rocsparse_csrtr_info holds triangular analysis meta data, referenced by
+ * the csrilu0_info / csrsv_upper_info / csrsv_lower_info members of
+ * _rocsparse_mat_info. Every member has a default so a new struct is well defined.
+ *******************************************************************************/
+struct _rocsparse_csrtr_info
+{
+    rocsparse_int max_depth = 0;
+    unsigned long long total_spin = 0;
+    rocsparse_int max_nnz = 0;
+
+    std::vector<rocsparse_int> rows_per_level;
+    rocsparse_int* row_map = nullptr; // freed with hipFree() in rocsparse_destroy_csrtr_info()
+    rocsparse_int* csr_diag_ind = nullptr; // freed with hipFree() in rocsparse_destroy_csrtr_info()
+
+    // some data to verify correct execution (not owned by this struct)
+    rocsparse_int m = 0;
+    rocsparse_int nnz = 0;
+    const _rocsparse_mat_descr* descr = nullptr;
+    const rocsparse_int* csr_row_ptr = nullptr;
+    const rocsparse_int* csr_col_ind = nullptr;
+};
+
+rocsparse_status rocsparse_create_csrtr_info(rocsparse_csrtr_info* info);
+
+rocsparse_status rocsparse_destroy_csrtr_info(rocsparse_csrtr_info info);
+
+
+
+
+
 /********************************************************************************
  * \brief ELL format indexing
  *******************************************************************************/
diff --git 
a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index d4de1825..1aa1c097 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -252,6 +252,81 @@ rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr) return descr->type; } + + + + + + + + + + + +rocsparse_status rocsparse_set_mat_fill_mode(rocsparse_mat_descr descr, rocsparse_fill_mode fill_mode) +{ + // Check if descriptor is valid + if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + if(fill_mode != rocsparse_fill_mode_lower && fill_mode != rocsparse_fill_mode_upper) + { + return rocsparse_status_invalid_value; + } + descr->fill_mode = fill_mode; + return rocsparse_status_success; +} + +rocsparse_fill_mode rocsparse_get_mat_fill_mode(const rocsparse_mat_descr descr) +{ + // If descriptor is invalid, default fill mode is returned + if(descr == nullptr) + { + return rocsparse_fill_mode_lower; + } + return descr->fill_mode; +} + + +rocsparse_status rocsparse_set_mat_diag_type(rocsparse_mat_descr descr, rocsparse_diag_type diag_type) +{ + // Check if descriptor is valid + if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + if(diag_type != rocsparse_diag_type_unit && diag_type != rocsparse_diag_type_non_unit) + { + return rocsparse_status_invalid_value; + } + descr->diag_type = diag_type; + return rocsparse_status_success; +} + +rocsparse_diag_type rocsparse_get_mat_diag_type(const rocsparse_mat_descr descr) +{ + // If descriptor is invalid, default diagonal type is returned + if(descr == nullptr) + { + return rocsparse_diag_type_non_unit; + } + return descr->diag_type; +} + + + + + + + + + + + + + + /******************************************************************************** * \brief rocsparse_create_hyb_mat is a structure holding the rocsparse HYB * matrix. 
It must be initialized using rocsparse_create_hyb_mat() @@ -359,12 +434,37 @@ rocsparse_status rocsparse_destroy_mat_info(rocsparse_mat_info info) return rocsparse_status_success; } + // Uncouple shared meta data + // TODO add more crossover data here + if(info->csrsv_lower_info == info->csrilu0_info) + { + info->csrsv_lower_info = nullptr; + } + // Clear csrmv info struct - if(info->csrmv_built == true) + if(info->csrmv_info != nullptr) { RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrmv_info(info->csrmv_info)); } + // Clear csrilu0 info struct + if(info->csrilu0_info != nullptr) + { + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrilu0_info)); + } + + // Clear csrsv upper info struct + if(info->csrsv_upper_info != nullptr) + { + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrsv_upper_info)); + } + + // Clear csrsv lower info struct + if(info->csrsv_lower_info != nullptr) + { + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrsv_lower_info)); + } + // Destruct try { From 0851b2dcc71a260f60225877096cb69baed6513d Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 5 Sep 2018 12:27:44 +0200 Subject: [PATCH 235/304] initial working versions of csrilu0 and csrsv --- library/include/rocsparse-functions.h | 154 +++++++ library/src/CMakeLists.txt | 4 + library/src/level2/csrsv_device.h | 380 ++++++++++++++++ library/src/level2/rocsparse_csrsv.cpp | 343 +++++++++++++++ library/src/level2/rocsparse_csrsv.hpp | 505 ++++++++++++++++++++++ library/src/precond/csrilu0_device.h | 323 ++++++++++++++ library/src/precond/rocsparse_csrilu0.cpp | 211 +++++++++ library/src/precond/rocsparse_csrilu0.hpp | 266 ++++++++++++ 8 files changed, 2186 insertions(+) create mode 100644 library/src/level2/csrsv_device.h create mode 100644 library/src/level2/rocsparse_csrsv.cpp create mode 100644 library/src/level2/rocsparse_csrsv.hpp create mode 100644 library/src/precond/csrilu0_device.h create mode 100644 
library/src/precond/rocsparse_csrilu0.cpp create mode 100644 library/src/precond/rocsparse_csrilu0.hpp diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index 4111afd5..bbdfa7c4 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -703,6 +703,87 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, rocsparse_double_complex* y); */ + + + + + + + + + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csrsv_buffer_size(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csrsv_analysis(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_solve_policy solve, + rocsparse_analysis_policy analysis, + void* temp_buffer); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csrsv_clear(const rocsparse_mat_descr descr, + rocsparse_mat_info info); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsrsv_solve(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const float* alpha, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + const float* x, + float* y, + rocsparse_solve_policy policy, + void* temp_buffer); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsrsv_solve(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const double* alpha, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + 
rocsparse_mat_info info, + const double* x, + double* y, + rocsparse_solve_policy policy, + void* temp_buffer); + + + + + + + + + + + + + /*! \brief SPARSE Level 2 API \details @@ -1003,6 +1084,79 @@ rocsparse_status rocsparse_zcsrmm(rocsparse_handle handle, rocsparse_int ldc); */ + + + + + + + +//TODO +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csrilu0_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csrilu0_analysis(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_solve_policy solve, + rocsparse_analysis_policy analysis, + void* temp_buffer); + + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csrilu0_clear(rocsparse_mat_info info); + + + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsrilu0(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_solve_policy policy, + void* temp_buffer); + + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsrilu0(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_solve_policy policy, + void* temp_buffer); + + + + + + + + + + + + + /* * =========================================================================== * Sparse Format Conversions diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 215585b8..b9a23b53 100644 --- a/library/src/CMakeLists.txt +++ 
b/library/src/CMakeLists.txt @@ -21,12 +21,16 @@ set(rocsparse_source # Level2 src/level2/rocsparse_coomv.cpp src/level2/rocsparse_csrmv.cpp + src/level2/rocsparse_csrsv.cpp src/level2/rocsparse_ellmv.cpp src/level2/rocsparse_hybmv.cpp # Level3 src/level3/rocsparse_csrmm.cpp +# Preconditioner + src/precond/rocsparse_csrilu0.cpp + # Conversion src/conversion/rocsparse_csr2coo.cpp src/conversion/rocsparse_csr2csc.cpp diff --git a/library/src/level2/csrsv_device.h b/library/src/level2/csrsv_device.h new file mode 100644 index 00000000..7069f1ad --- /dev/null +++ b/library/src/level2/csrsv_device.h @@ -0,0 +1,380 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef CSRSV_DEVICE_H +#define CSRSV_DEVICE_H + +#include + +template +static __device__ __inline__ void two_reduce(int* local_max, int *local_spin) +{ +#if defined(__HIP_PLATFORM_HCC__) + int max_depth = *local_max; + + if(WF_SIZE > 1) + { + // row_shr = 1 + max_depth = __hip_move_dpp(*local_max, 0x111, 0xf, 0xf, 0); + *local_spin += __hip_move_dpp(*local_spin, 0x111, 0xf, 0xf, 0); + *local_max = (max_depth > *local_max) ? max_depth : *local_max; + } + + if(WF_SIZE > 2) + { + // row_shr = 2 + max_depth = __hip_move_dpp(*local_max, 0x112, 0xf, 0xf, 0); + *local_spin += __hip_move_dpp(*local_spin, 0x112, 0xf, 0xf, 0); + *local_max = (max_depth > *local_max) ? max_depth : *local_max; + } + + if(WF_SIZE > 4) + { + // row_shr = 4 ; bank_mask = 0xe + max_depth = __hip_move_dpp(*local_max, 0x114, 0xf, 0xe, 0); + *local_spin += __hip_move_dpp(*local_spin, 0x114, 0xf, 0xe, 0); + *local_max = (max_depth > *local_max) ? 
max_depth : *local_max; + } + + if(WF_SIZE > 8) + { + // row_shr = 8 ; bank_mask = 0xc + max_depth = __hip_move_dpp(*local_max, 0x118, 0xf, 0xc, 0); + *local_spin += __hip_move_dpp(*local_spin, 0x118, 0xf, 0xc, 0); + *local_max = (max_depth > *local_max) ? max_depth : *local_max; + } + + if(WF_SIZE > 16) + { + // row_bcast = 15 ; row_mask = 0xa + max_depth = __hip_move_dpp(*local_max, 0x142, 0xa, 0xf, 0); + *local_spin += __hip_move_dpp(*local_spin, 0x142, 0xa, 0xf, 0); + *local_max = (max_depth > *local_max) ? max_depth : *local_max; + } + + if(WF_SIZE > 32) + { + // row_bcast = 31 ; row_mask = 0xc + max_depth = __hip_move_dpp(*local_max, 0x143, 0xc, 0xf, 0); + *local_spin += __hip_move_dpp(*local_spin, 0x143, 0xc, 0xf, 0); + *local_max = (max_depth > *local_max) ? max_depth : *local_max; + } +#elif defined(__HIP_PLATFORM_NVCC__) + for(int i = WF_SIZE >> 1; i >= 1; i >>= 1) + { + *local_max = max(*local_max, __shfl_down_sync(0xffffffff, *local_max, i)); + *local_spin += __shfl_down_sync(0xffffffff, *local_spin, i); + } +#endif +} + +template +__global__ void csrsv_analysis_kernel(rocsparse_int m, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + rocsparse_int* __restrict__ csr_diag_ind, + rocsparse_int* __restrict__ done_array, + rocsparse_int* __restrict__ rows_per_level, + rocsparse_int* __restrict__ max_depth, + unsigned long long* __restrict__ total_spin, + rocsparse_int* __restrict__ max_nnz, + rocsparse_index_base idx_base) +{ + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; + rocsparse_int lid = tid & (WF_SIZE - 1); + rocsparse_int row = gid / WF_SIZE; + + if(row >= m) + { + return; + } + + if(FILL_MODE == rocsparse_fill_mode_upper) + { + // Processing upper triangular matrix + row = m - 1 - row; + } + + // Initialize matrix diagonal index + if(lid == 0) + { + csr_diag_ind[row] = -1; + } + + rocsparse_int local_max = 0; + rocsparse_int local_spin = 0; + + int 
row_begin = csr_row_ptr[row] - idx_base; + int row_end = csr_row_ptr[row + 1] - idx_base; + + // This wavefront operates on a single row, from its beginning to end. + for(int j = row_begin + lid; j < row_end; j += WF_SIZE) + { + // local_col will tell us, for this iteration of the above for loop + // (i.e. for this entry in this row), which columns contain the + // non-zero values. We must then ensure that the output from the row + // associated with the local_col is complete to ensure that we can + // calculate the right answer. + int local_col = csr_col_ind[j] - idx_base; + + // Store diagonal index + if(local_col == row) + { + csr_diag_ind[row] = j; + } + + if(FILL_MODE == rocsparse_fill_mode_upper) + { + if(local_col <= row) + { + continue; + } + } + else if(FILL_MODE == rocsparse_fill_mode_lower) + { + // Diagonal and above, skip this. + if (local_col >= row) + { + break; + } + } + + int local_done = 0; + + // While there are threads in this workgroup that have been unable to + // get their input, loop and wait for the flag to exist. + while (!local_done) + { +#if defined(__HIP_PLATFORM_HCC__) + local_done = __atomic_load_n(&done_array[local_col], __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + local_done = atomicOr(&done_array[local_col], 0); +#endif + ++local_spin; + } + + local_max = max(local_done, local_max); + } + + // Determine maximum local depth and local spin loops + two_reduce(&local_max, &local_spin); + ++local_max; + + if (lid == WF_SIZE - 1) + { +#if defined(__HIP_PLATFORM_HCC__) + __atomic_store_n(&done_array[row], local_max, __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + atomicOr(&done_array[row], local_max); +#endif + + // Must atomic these next three, since other WGs are doing the same thing + // We're sending out "local_max - 1" because of 0-based indexing. + // However, we needed to put a non-zero value into the done_array up above + // when we crammed local_depth in, so these two will be off by one. 
+ atomicAdd(&rows_per_level[local_max-1], 1); + atomicMax(max_depth, local_max); + atomicAdd(total_spin, local_spin); + atomicMax(max_nnz, row_end - row_begin); + } +} + + + + + + +#if defined(__HIP_PLATFORM_HCC__) +// While HIP does not contain llvm intrinsics +__device__ int __llvm_amdgcn_readlane(int index, int offset) __asm("llvm.amdgcn.readlane"); + +static __device__ __inline__ float wf_reduce(float temp_sum) +{ + typedef union flt_b32 { + float val; + int b32; + } flt_b32_t; + flt_b32_t upper_sum, t_temp_sum; + + t_temp_sum.val = temp_sum; + upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x80b1); + t_temp_sum.val += upper_sum.val; + upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x804e); + t_temp_sum.val += upper_sum.val; + upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x101f); + t_temp_sum.val += upper_sum.val; + upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x201f); + t_temp_sum.val += upper_sum.val; + upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x401f); + t_temp_sum.val += upper_sum.val; + upper_sum.b32 = __llvm_amdgcn_readlane(t_temp_sum.b32, 32); + t_temp_sum.val += upper_sum.val; + temp_sum = t_temp_sum.val; + + return temp_sum; +} + +static __device__ __inline__ double wf_reduce(double temp_sum) +{ + typedef union dbl_b32 { + double val; + int b32[2]; + } dbl_b32_t; + dbl_b32_t upper_sum, t_temp_sum; + + t_temp_sum.val = temp_sum; + upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x80b1); + upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x80b1); + t_temp_sum.val += upper_sum.val; + upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x804e); + upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x804e); + t_temp_sum.val += upper_sum.val; + upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x101f); + upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x101f); + t_temp_sum.val += upper_sum.val; + upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x201f); + upper_sum.b32[1] = 
__hip_ds_swizzle(t_temp_sum.b32[1], 0x201f); + t_temp_sum.val += upper_sum.val; + upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x401f); + upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x401f); + t_temp_sum.val += upper_sum.val; + upper_sum.b32[0] = __llvm_amdgcn_readlane(t_temp_sum.b32[0], 32); + upper_sum.b32[1] = __llvm_amdgcn_readlane(t_temp_sum.b32[1], 32); + t_temp_sum.val += upper_sum.val; + temp_sum = t_temp_sum.val; + + return temp_sum; +} +#elif defined(__HIP_PLATFORM_NVCC__) +template +static __device__ __inline__ T wf_reduce(T temp_sum) +{ + for(int i = 16; i >= 1; i >>= 1) + { + temp_sum += __shfl_down_sync(0xffffffff, temp_sum, i); + } + + return temp_sum; +} +#endif + +template +__device__ void csrsv_device(rocsparse_int m, + T alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + T* __restrict__ y, + rocsparse_int* __restrict__ done_array, + rocsparse_int* __restrict__ map, + rocsparse_int offset, + rocsparse_index_base idx_base, + rocsparse_fill_mode fill_mode, + rocsparse_diag_type diag_type) +{ + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * BLOCKSIZE + tid; + rocsparse_int lid = tid & (WF_SIZE - 1); + rocsparse_int wid = tid / WF_SIZE; + rocsparse_int idx = gid / WF_SIZE; + + __shared__ T diagonal[BLOCKSIZE / WF_SIZE]; + + if(idx >= m) + { + return; + } + + rocsparse_int row = map[idx + offset]; + rocsparse_int row_begin = csr_row_ptr[row] - idx_base; + rocsparse_int row_end = csr_row_ptr[row + 1] - idx_base; + + T local_sum = static_cast(0); + + if(lid == 0) + { + local_sum = alpha * x[row]; + } + + for(rocsparse_int j = row_begin + lid; j < row_end; j += WF_SIZE) + { + rocsparse_int local_col = csr_col_ind[j] - idx_base; + T local_val = csr_val[j]; + + if(fill_mode == rocsparse_fill_mode_upper) + { + // Processing upper triangular + if(local_col < row) + { + continue; + } + + 
if(local_col == row) + { + if(diag_type == rocsparse_diag_type_non_unit) + { + diagonal[wid] = static_cast(1) / local_val; + } + + continue; + } + } + else if(fill_mode == rocsparse_fill_mode_lower) + { + // Processing lower triangular + if(local_col > row) + { + break; + } + + if(local_col == row) + { + if(diag_type == rocsparse_diag_type_non_unit) + { + diagonal[wid] = static_cast(1) / local_val; + } + + break; + } + } + +#if defined(__HIP_PLATFORM_HCC__) + while(!__atomic_load_n(&done_array[local_col], __ATOMIC_RELAXED)); +#elif defined(__HIP_PLATFORM_NVCC__) + while(!atomicOr(&done_array[local_col], 0)); +#endif + +#if defined(__HIP_PLATFORM_HCC__) + T out_val; + __atomic_load(&y[local_col], &out_val, __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + T out_val = y[local_col]; +#endif + + local_sum -= local_val * out_val; + } + + local_sum = wf_reduce(local_sum); + + if(diag_type == rocsparse_diag_type_non_unit) + { + local_sum *= diagonal[wid]; + } + + if (lid == 0) + { +#if defined(__HIP_PLATFORM_HCC__) + __atomic_store(&y[row], &local_sum, __ATOMIC_RELAXED); + __atomic_store_n(&done_array[row], 1, __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + y[row] = local_sum; + atomicOr(&done_array[row], 1); +#endif + } +} + +#endif // CSRSV_DEVICE_H diff --git a/library/src/level2/rocsparse_csrsv.cpp b/library/src/level2/rocsparse_csrsv.cpp new file mode 100644 index 00000000..12a2421b --- /dev/null +++ b/library/src/level2/rocsparse_csrsv.cpp @@ -0,0 +1,343 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "rocsparse.h" +#include "rocsparse_csrsv.hpp" + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_csrsv_buffer_size(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging + log_trace(handle, + "rocsparse_csrsv_buffer_size", + trans, + m, + nnz, + (const void*&)descr, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)info, + (const void*&)buffer_size); + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(buffer_size == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || nnz == 0) + { + // Do not return 0 as buffer size + *buffer_size = 4; + return rocsparse_status_success; + } + 
+    // rocsparse_int max depth
+    *buffer_size = 256;
+
+    // unsigned long long total_spin
+    *buffer_size += 256;
+
+    // rocsparse_int max_nnz
+    *buffer_size += 256;
+
+    // rocsparse_int done_array[m]
+    *buffer_size += sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256;
+
+    // rocsparse_int rows_per_level[m]
+    *buffer_size += sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256;
+
+    size_t hipcub_size = 0;
+    rocsparse_int* ptr = nullptr;
+    RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum(nullptr, hipcub_size, ptr, ptr, m));
+
+    // hipcub buffer
+    *buffer_size += hipcub_size;
+
+    return rocsparse_status_success;
+}
+
+extern "C" rocsparse_status rocsparse_csrsv_analysis(rocsparse_handle handle,
+                                                     rocsparse_operation trans,
+                                                     rocsparse_int m,
+                                                     rocsparse_int nnz,
+                                                     const rocsparse_mat_descr descr,
+                                                     const rocsparse_int* csr_row_ptr,
+                                                     const rocsparse_int* csr_col_ind,
+                                                     rocsparse_mat_info info,
+                                                     rocsparse_solve_policy solve,
+                                                     rocsparse_analysis_policy analysis,
+                                                     void* temp_buffer)
+{
+    // Check for valid handle and matrix descriptor
+    if(handle == nullptr)
+    {
+        return rocsparse_status_invalid_handle;
+    }
+    else if(descr == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+    else if(info == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+
+    // Logging
+    log_trace(handle,
+              "rocsparse_csrsv_analysis",
+              trans,
+              m,
+              nnz,
+              (const void*&)descr,
+              (const void*&)csr_row_ptr,
+              (const void*&)csr_col_ind,
+              (const void*&)info,
+              solve,
+              analysis,
+              (const void*&)temp_buffer);
+
+    // Check index base
+    if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one)
+    {
+        return rocsparse_status_invalid_value;
+    }
+
+    // Check matrix type
+    if(descr->type != rocsparse_matrix_type_general)
+    {
+        // TODO
+        return rocsparse_status_not_implemented;
+    }
+
+    // Check analysis policy
+    if(analysis != rocsparse_analysis_policy_reuse && analysis != rocsparse_analysis_policy_force)
+    {
+        return rocsparse_status_invalid_value;
+    }
+
+    // Check solve policy
+    if(solve != rocsparse_solve_policy_auto)
+    {
+        return rocsparse_status_invalid_value;
+    }
+
+    // Check sizes
+    if(m < 0)
+    {
+        return rocsparse_status_invalid_size;
+    }
+    else if(nnz < 0)
+    {
+        return rocsparse_status_invalid_size;
+    }
+
+    // Check pointer arguments
+    if(csr_row_ptr == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+    else if(csr_col_ind == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+    else if(temp_buffer == nullptr)
+    {
+        return rocsparse_status_invalid_pointer;
+    }
+
+    // Quick return if possible
+    if(m == 0 || nnz == 0)
+    {
+        return rocsparse_status_success;
+    }
+
+    // Switch between lower and upper triangular analysis
+    if(descr->fill_mode == rocsparse_fill_mode_upper)
+    {
+        // This is currently the only case where we need upper triangular analysis,
+        // therefore we ignore the analysis policy
+
+        // Clear csrsv info
+        RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrsv_upper_info));
+
+        // Create csrsv info
+        RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_csrtr_info(&info->csrsv_upper_info));
+
+        // Perform analysis
+        RETURN_IF_ROCSPARSE_ERROR(rocsparse_csrtr_analysis(handle,
+                                                           trans,
+                                                           m,
+                                                           nnz,
+                                                           descr,
+                                                           csr_row_ptr,
+                                                           csr_col_ind,
+                                                           info->csrsv_upper_info,
+                                                           solve,
+                                                           analysis,
+                                                           temp_buffer));
+    }
+    else
+    {
+        // Differentiate the analysis policies
+        if(analysis == rocsparse_analysis_policy_reuse)
+        {
+            // We try to re-use already analyzed lower part, if available.
+            // It is the user's responsibility that this data is still valid,
+            // since he passed the 'reuse' flag.
+
+            // If csrsv meta data is already available, do nothing
+            if(info->csrsv_lower_info != nullptr)
+            {
+                return rocsparse_status_success;
+            }
+
+            // Check for other lower analysis meta data
+            rocsparse_csrtr_info reuse = nullptr;
+
+            // ILU0 meta data
+            if(info->csrilu0_info != nullptr)
+            {
+                reuse = info->csrilu0_info;
+            }
+            // TODO add more crossover data here
+
+            // If data has been found, use it
+            if(reuse != nullptr)
+            {
+                info->csrsv_lower_info = reuse;
+
+                return rocsparse_status_success;
+            }
+        }
+
+        // User is explicitly asking to force a re-analysis, or no valid data has been
+        // found to be re-used.
+
+        // Clear csrsv info, but never free meta data that is still owned by csrilu0_info
+        if(info->csrsv_lower_info != info->csrilu0_info)
+        {
+            RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrsv_lower_info));
+        }
+
+        // Create csrsv info
+        RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_csrtr_info(&info->csrsv_lower_info));
+
+        // Perform analysis
+        RETURN_IF_ROCSPARSE_ERROR(rocsparse_csrtr_analysis(handle,
+                                                           trans,
+                                                           m,
+                                                           nnz,
+                                                           descr,
+                                                           csr_row_ptr,
+                                                           csr_col_ind,
+                                                           info->csrsv_lower_info,
+                                                           solve,
+                                                           analysis,
+                                                           temp_buffer));
+    }
+
+    return rocsparse_status_success;
+}
+
+extern "C" rocsparse_status rocsparse_csrsv_clear(const rocsparse_mat_descr descr,
+                                                  rocsparse_mat_info info)
+{
+    return rocsparse_status_success;
+}
+
+extern "C" rocsparse_status rocsparse_scsrsv_solve(rocsparse_handle handle,
+                                                   rocsparse_operation trans,
+                                                   rocsparse_int m,
+                                                   rocsparse_int nnz,
+                                                   const float* alpha,
+                                                   const rocsparse_mat_descr descr,
+                                                   const float* csr_val,
+                                                   const rocsparse_int* csr_row_ind,
+                                                   const rocsparse_int* csr_col_ind,
+                                                   rocsparse_mat_info info,
+                                                   const float* x,
+                                                   float* y,
+                                                   rocsparse_solve_policy policy,
+                                                   void* temp_buffer)
+{
+    return rocsparse_csrsv_solve_template(
+        handle, trans, m, nnz, alpha, descr, csr_val, csr_row_ind, csr_col_ind, info, x, y, policy, temp_buffer);
+}
+
+extern "C" rocsparse_status rocsparse_dcsrsv_solve(rocsparse_handle handle,
+                                                   rocsparse_operation trans,
+                                                   rocsparse_int m,
+                                                   rocsparse_int nnz,
+                                                   const double* alpha,
+                                                   const 
rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ind, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + const double* x, + double* y, + rocsparse_solve_policy policy, + void* temp_buffer) +{ + return rocsparse_csrsv_solve_template( + handle, trans, m, nnz, alpha, descr, csr_val, csr_row_ind, csr_col_ind, info, x, y, policy, temp_buffer); +} diff --git a/library/src/level2/rocsparse_csrsv.hpp b/library/src/level2/rocsparse_csrsv.hpp new file mode 100644 index 00000000..9dba8d07 --- /dev/null +++ b/library/src/level2/rocsparse_csrsv.hpp @@ -0,0 +1,505 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_CSRSV_HPP +#define ROCSPARSE_CSRSV_HPP + +#include "rocsparse.h" +#include "definitions.h" +#include "handle.h" +#include "utility.h" +#include "csrsv_device.h" + +#include +#include + +static rocsparse_status rocsparse_csrtr_analysis(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_csrtr_info info, + rocsparse_solve_policy solve, + rocsparse_analysis_policy analysis, + void* temp_buffer) +{ + // Stream + hipStream_t stream = handle->stream; + + // Buffer + char* ptr = reinterpret_cast(temp_buffer); + + // Initialize temporary buffer + size_t buffer_size = 256 + + 256 + + 256 + + sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256 + + sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256; + + RETURN_IF_HIP_ERROR(hipMemset(ptr, 0, sizeof(char) * buffer_size)); + + // max_depth + rocsparse_int* d_max_depth = reinterpret_cast(ptr); + ptr += 256; + + // total_spin + unsigned long long* d_total_spin = reinterpret_cast(ptr); + ptr += 256; + + // max_nnz + rocsparse_int* d_max_nnz = 
reinterpret_cast(ptr); + ptr += 256; + + // done array + rocsparse_int* d_done_array = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256; + + // rows_per_level + rocsparse_int* d_rows_per_level = reinterpret_cast(ptr); + ptr += sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256; + + // hipcub buffer + void* hipcub_buffer = reinterpret_cast(ptr); + + // Allocate buffer to hold diagonal entry point + RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->csr_diag_ind, sizeof(rocsparse_int) * m)); + + + + + // Run analysis +#define CSRILU0_DIM 1024 + dim3 csrsv_blocks((handle->wavefront_size * m - 1) / CSRILU0_DIM + 1); + dim3 csrsv_threads(CSRILU0_DIM); +#undef CSRILU0_DIM + + if(handle->wavefront_size == 32) + { + if(descr->fill_mode == rocsparse_fill_mode_upper) + { + hipLaunchKernelGGL((csrsv_analysis_kernel<32, rocsparse_fill_mode_upper>), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + info->csr_diag_ind, + d_done_array, + d_rows_per_level, + d_max_depth, + d_total_spin, + d_max_nnz, + descr->base); + } + else if(descr->fill_mode == rocsparse_fill_mode_lower) + { + hipLaunchKernelGGL((csrsv_analysis_kernel<32, rocsparse_fill_mode_lower>), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + info->csr_diag_ind, + d_done_array, + d_rows_per_level, + d_max_depth, + d_total_spin, + d_max_nnz, + descr->base); + } + } + else if(handle->wavefront_size == 64) + { + if(descr->fill_mode == rocsparse_fill_mode_upper) + { + hipLaunchKernelGGL((csrsv_analysis_kernel<64, rocsparse_fill_mode_upper>), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + info->csr_diag_ind, + d_done_array, + d_rows_per_level, + d_max_depth, + d_total_spin, + d_max_nnz, + descr->base); + } + else if(descr->fill_mode == rocsparse_fill_mode_lower) + { + hipLaunchKernelGGL((csrsv_analysis_kernel<64, rocsparse_fill_mode_lower>), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, 
+ csr_row_ptr, + csr_col_ind, + info->csr_diag_ind, + d_done_array, + d_rows_per_level, + d_max_depth, + d_total_spin, + d_max_nnz, + descr->base); + } + } + else + { + return rocsparse_status_arch_mismatch; + } + + // Post processing + RETURN_IF_HIP_ERROR(hipMemcpy(&info->max_depth, d_max_depth, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR(hipMemcpy(&info->total_spin, d_total_spin, sizeof(unsigned long long), hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR(hipMemcpy(&info->max_nnz, d_max_nnz, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + // Inclusive sum to obtain rows per level + size_t hipcub_size = 0; + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum(nullptr, hipcub_size, d_rows_per_level, d_rows_per_level, info->max_depth)); + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum(hipcub_buffer, hipcub_size, d_rows_per_level, d_rows_per_level, info->max_depth)); + + // Allocate host memory for meta data + info->rows_per_level.resize(info->max_depth); + std::vector done_array(m); + + // Move meta data to host (required for kernel launching) + RETURN_IF_HIP_ERROR(hipMemcpy(info->rows_per_level.data(), d_rows_per_level, sizeof(rocsparse_int) * info->max_depth, hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR(hipMemcpy(done_array.data(), d_done_array, sizeof(rocsparse_int) * m, hipMemcpyDeviceToHost)); + + std::vector row_map(m + 1, 0); + std::vector counter(info->max_depth, 0); + + // Create row map + for(rocsparse_int i = 0; i < m; ++i) + { + rocsparse_int level = done_array[i] - 1; + rocsparse_int prev_level = level - 1; + rocsparse_int depth_offset = (level == 0) ? 
0 : info->rows_per_level[prev_level]; + + row_map[depth_offset + counter[level]] = i; + ++counter[level]; + } + + // Copy row map to device + RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->row_map, sizeof(rocsparse_int) * (m + 1))); + RETURN_IF_HIP_ERROR(hipMemcpy(info->row_map, row_map.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + + + + + + + + + + + + + // Store some pointers to verify correct execution + info->m = m; + info->nnz = nnz; + info->descr = descr; + info->csr_row_ptr = csr_row_ptr; + info->csr_col_ind = csr_col_ind; + + return rocsparse_status_success; +} + +template +__launch_bounds__(BLOCKSIZE) +__global__ void csrsv_host_pointer(rocsparse_int m, + T alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + T* __restrict__ y, + rocsparse_int* __restrict__ done_array, + rocsparse_int* __restrict__ map, + rocsparse_int offset, + rocsparse_index_base idx_base, + rocsparse_fill_mode fill_mode, + rocsparse_diag_type diag_type) +{ + csrsv_device(m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + done_array, + map, + offset, + idx_base, + fill_mode, + diag_type); +} + +template +__launch_bounds__(BLOCKSIZE) +__global__ void csrsv_device_pointer(rocsparse_int m, + const T* alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + T* __restrict__ y, + rocsparse_int* __restrict__ done_array, + rocsparse_int* __restrict__ map, + rocsparse_int offset, + rocsparse_index_base idx_base, + rocsparse_fill_mode fill_mode, + rocsparse_diag_type diag_type) +{ + if(*alpha == static_cast(0)) + { + return; + } + + csrsv_device(m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + done_array, + map, + offset, + idx_base, + fill_mode, + diag_type); +} + +template +rocsparse_status 
rocsparse_csrsv_solve_template(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const T* alpha, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + const T* x, + T* y, + rocsparse_solve_policy policy, + void* temp_buffer) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging + if(handle->pointer_mode == rocsparse_pointer_mode_host) + { + log_trace(handle, + replaceX("rocsparse_Xcsrsv"), + trans, + m, + nnz, + *alpha, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)info, + (const void*&)x, + (const void*&)y, + policy, + (const void*&)temp_buffer); + + log_bench(handle, + "./rocsparse-bench -f csrsv -r", + replaceX("X"), + "--mtx ", + "--alpha", + *alpha); + } + else + { + log_trace(handle, + replaceX("rocsparse_Xcsrsv"), + trans, + m, + nnz, + (const void*&)alpha, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)info, + (const void*&)x, + (const void*&)y, + policy, + (const void*&)temp_buffer); + } + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else 
if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(alpha == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(x == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(y == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(temp_buffer == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + + + + + + + // Buffer + char* ptr = reinterpret_cast(temp_buffer); + + // done array + rocsparse_int* d_done_array = reinterpret_cast(ptr); + + // Initialize buffers + RETURN_IF_HIP_ERROR(hipMemset(d_done_array, 0, sizeof(rocsparse_int) * m)); + + + + + + rocsparse_csrtr_info csrsv = (descr->fill_mode == rocsparse_fill_mode_upper) ? + info->csrsv_upper_info : + info->csrsv_lower_info; + + + + + +#define CSRSV_DIM 1024 + dim3 csrsv_blocks((handle->wavefront_size * m - 1) / CSRSV_DIM + 1); + dim3 csrsv_threads(CSRSV_DIM); + +// TODO host dev ptr + if(handle->wavefront_size == 32) + { + hipLaunchKernelGGL((csrsv_host_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + 0, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else if(handle->wavefront_size == 64) + { + hipLaunchKernelGGL((csrsv_host_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + 0, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else + { + return rocsparse_status_arch_mismatch; + } +#undef CSRSV_DIM + + return rocsparse_status_success; +} + +#endif // ROCSPARSE_CSRSV_HPP diff --git a/library/src/precond/csrilu0_device.h 
b/library/src/precond/csrilu0_device.h new file mode 100644 index 00000000..5f6c9363 --- /dev/null +++ b/library/src/precond/csrilu0_device.h @@ -0,0 +1,323 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef CSRILU0_DEVICE_H +#define CSRILU0_DEVICE_H + +#include + +template +__global__ void csrilu0_hash_kernel(rocsparse_int m, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + T* __restrict__ csr_val, + const rocsparse_int* __restrict__ csr_diag_ind, + rocsparse_int* __restrict__ done, + rocsparse_int* __restrict__ map, + rocsparse_int* __restrict__ zero_pivot, + rocsparse_index_base idx_base) +{ + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; + rocsparse_int lid = tid & (WF_SIZE - 1); + rocsparse_int idx = gid / WF_SIZE; + +int wid = tid / WF_SIZE; + + __shared__ rocsparse_int stable[BLOCKSIZE / WF_SIZE][WF_SIZE * HASH];//[BLOCKSIZE * HASH]; + __shared__ rocsparse_int sdata[BLOCKSIZE / WF_SIZE][WF_SIZE * HASH]; + + for(rocsparse_int j = 0; j < HASH; ++j) + { + stable[wid][lid + j * WF_SIZE] = -1; + } + + if (idx >= m) + { + return; + } + + rocsparse_int row = map[idx]; + rocsparse_int row_diag = csr_diag_ind[row]; + + // Row has structural zero diagonal, skip + if(row_diag == -1) + { + if(lid == 0) + { + atomicMin(zero_pivot, row); +#if defined(__HIP_PLATFORM_HCC__) + __atomic_store_n(&done[row], 1, __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + atomicOr(&done[row], 1); +#endif + } + + return; + } + + rocsparse_int row_begin = csr_row_ptr[row] - idx_base; + rocsparse_int row_end = csr_row_ptr[row + 1] - idx_base; + + // Fill hash table +// rocsparse_int* table = &stable[(tid / WF_SIZE) * WF_SIZE * HASH]; +// rocsparse_int* data = &sdata[(tid / WF_SIZE) * WF_SIZE * HASH]; + 
rocsparse_int* table = stable[wid]; + rocsparse_int* data = sdata[wid]; + + for(rocsparse_int j = row_begin + lid; j < row_end; j += WF_SIZE) + { + // Insert key into hash table + int key = csr_col_ind[j]; + int hash = (key * 103) & (WF_SIZE * HASH - 1); + + while(true) + { + if(table[hash] == key) + { + break; + } + else if(atomicCAS(&table[hash], -1, key) == -1) + { + data[hash] = j; + break; + } + else + { + hash = (hash + 1) & (WF_SIZE * HASH - 1); + } + } + } + + for(rocsparse_int j = row_begin; j < row_diag; ++j) + { + rocsparse_int local_col = csr_col_ind[j] - idx_base; + T local_val = csr_val[j]; + rocsparse_int local_end = csr_row_ptr[local_col + 1] - idx_base; + rocsparse_int local_diag = csr_diag_ind[local_col]; + + // Row depends on structural zero diagonal + if(local_diag == -1) + { + if(lid == 0) + { + atomicMin(zero_pivot, local_col); + } + + break; + } + + rocsparse_int local_done = 0; + while(!local_done) + { +#if defined(__HIP_PLATFORM_HCC__) + local_done = __atomic_load_n(&done[local_col], __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + local_done = atomicOr(&done[local_col], 0x0); +#endif + } + +#if defined(__HIP_PLATFORM_HCC__) + T diag_val; + __atomic_load(&csr_val[local_diag], &diag_val, __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + T diag_val = csr_val[local_diag]; +#endif + + // Row has numerical zero diagonal + if(diag_val == 0.0) + { + if(lid == 0) + { + atomicMin(zero_pivot, local_col); + } + + break; + } + + csr_val[j] = local_val /= diag_val; + + for(rocsparse_int k = local_diag + 1 + lid; k < local_end; k += WF_SIZE) + { + // Get value from hash table + int key = csr_col_ind[k]; + int hash = (key * 103) & (WF_SIZE * HASH - 1); + + while(true) + { + int val = table[hash]; + + if(val == -1) + { + break; + } + else if(val == key) + { +#if defined(__HIP_PLATFORM_HCC__) + T val_k; + __atomic_load(&csr_val[k], &val_k, __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + T val_k = csr_val[k]; +#endif + 
csr_val[data[hash]] -= local_val * val_k; + break; + } + + hash = (hash + 1) & (WF_SIZE * HASH - 1); + } + } + } + + if(lid == 0) + { +#if defined(__HIP_PLATFORM_HCC__) + __atomic_store_n(&done[row], 1, __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + atomicOr(&done[row], 1); +#endif + } +} + +template +__global__ void csrilu0_binsearch_kernel(rocsparse_int m, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + T* __restrict__ csr_val, + const rocsparse_int* __restrict__ csr_diag_ind, + rocsparse_int* __restrict__ done, + rocsparse_int* __restrict__ map, + rocsparse_int* __restrict__ zero_pivot, + rocsparse_index_base idx_base) +{ + rocsparse_int tid = hipThreadIdx_x; + rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + tid; + rocsparse_int lid = tid & (WF_SIZE - 1); + rocsparse_int idx = gid / WF_SIZE; + + if (idx >= m) + { + return; + } + + rocsparse_int row = map[idx]; + rocsparse_int row_diag = csr_diag_ind[row]; + + // Row has structural zero diagonal, skip + if(row_diag == -1) + { + if(lid == 0) + { + atomicMin(zero_pivot, row); +#if defined(__HIP_PLATFORM_HCC__) + __atomic_store_n(&done[row], 1, __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + atomicOr(&done[row], 1); +#endif + } + + return; + } + + rocsparse_int row_begin = csr_row_ptr[row] - idx_base; + rocsparse_int row_end = csr_row_ptr[row + 1] - idx_base; + + for(rocsparse_int j = row_begin; j < row_diag; ++j) + { + rocsparse_int local_col = csr_col_ind[j] - idx_base; + T local_val = csr_val[j]; + rocsparse_int local_end = csr_row_ptr[local_col + 1] - idx_base; + rocsparse_int local_diag = csr_diag_ind[local_col]; + + // Row depends on structural zero diagonal + if(local_diag == -1) + { + if(lid == 0) + { + atomicMin(zero_pivot, local_col); + } + + break; + } + + rocsparse_int local_done = 0; + while(!local_done) + { +#if defined(__HIP_PLATFORM_HCC__) + local_done = __atomic_load_n(&done[local_col], __ATOMIC_RELAXED); +#elif 
defined(__HIP_PLATFORM_NVCC__) + local_done = atomicOr(&done[local_col], 0x0); +#endif + } + +#if defined(__HIP_PLATFORM_HCC__) + T diag_val; + __atomic_load(&csr_val[local_diag], &diag_val, __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + // TODO + volatile T diag_val = csr_val[local_diag]; +#endif + + // Row has numerical zero diagonal + if(diag_val == 0.0) + { + if(lid == 0) + { + atomicMin(zero_pivot, local_col); + } + + break; + } + + csr_val[j] = local_val /= diag_val; + + rocsparse_int l = j + 1; + for(rocsparse_int k = local_diag + 1 + lid; k < local_end; k += WF_SIZE) + { + rocsparse_int r = row_end - 1; + rocsparse_int m = (r + l) >> 1; + rocsparse_int col_j = csr_col_ind[m]; + + rocsparse_int col_k = csr_col_ind[k]; + + while(l < r) + { + if(col_j < col_k) + { + l = m + 1; + } + else + { + r = m; + } + + m = (r + l) >> 1; + col_j = csr_col_ind[m]; + } + + if(col_j == col_k) + { +#if defined(__HIP_PLATFORM_HCC__) + T val_k; + __atomic_load(&csr_val[k], &val_k, __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + volatile T val_k = csr_val[k]; +#endif + + csr_val[l] -= local_val * val_k; + } + } + } + + if(lid == 0) + { +#if defined(__HIP_PLATFORM_HCC__) + __atomic_store_n(&done[row], 1, __ATOMIC_RELAXED); +#elif defined(__HIP_PLATFORM_NVCC__) + atomicOr(&done[row], 1); +#endif + } +} + +#endif // CSRMV_DEVICE_H diff --git a/library/src/precond/rocsparse_csrilu0.cpp b/library/src/precond/rocsparse_csrilu0.cpp new file mode 100644 index 00000000..75339682 --- /dev/null +++ b/library/src/precond/rocsparse_csrilu0.cpp @@ -0,0 +1,211 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "definitions.h" +#include "rocsparse.h" +#include "rocsparse_csrilu0.hpp" + +#include "../level2/rocsparse_csrsv.hpp" + +#include + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" rocsparse_status rocsparse_csrilu0_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size) +{ + return rocsparse_csrsv_buffer_size(handle, + rocsparse_operation_none, + m, + nnz, + descr, + csr_row_ptr, + csr_col_ind, + info, + buffer_size); +} + +extern "C" rocsparse_status rocsparse_csrilu0_analysis(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_solve_policy solve, + rocsparse_analysis_policy analysis, + void* temp_buffer) +{ + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging + log_trace(handle, + "rocsparse_csrilu0_analysis", + m, + nnz, + (const void*&)descr, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)info, + solve, + analysis); + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check analysis policy + if(analysis != 
rocsparse_analysis_policy_reuse && analysis != rocsparse_analysis_policy_force) + { + return rocsparse_status_invalid_value; + } + + // Check solve policy + if(solve != rocsparse_solve_policy_auto) + { + return rocsparse_status_invalid_value; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(temp_buffer == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Clear csrilu0 info + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrilu0_info)); + + // Create csrilu0 info + RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_csrtr_info(&info->csrilu0_info)); + + // Call analysis routine (shared with csrsv and csric0) + RETURN_IF_ROCSPARSE_ERROR(rocsparse_csrtr_analysis(handle, + rocsparse_operation_none, + m, + nnz, + descr, + csr_row_ptr, + csr_col_ind, + info->csrilu0_info, + solve, + analysis, + temp_buffer)); + + return rocsparse_status_success; +} + +extern "C" rocsparse_status rocsparse_csrilu0_clear(rocsparse_mat_info info) +{ + // If meta data is shared, do not delete anything + if(info->csrilu0_info == info->csrsv_lower_info) + { + info->csrilu0_info = nullptr; + + return rocsparse_status_success; + } + + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrilu0_info)); + info->csrilu0_info = nullptr; + + return rocsparse_status_success; +} + +extern "C" rocsparse_status rocsparse_scsrilu0(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_solve_policy 
policy, + void* temp_buffer) +{ + return rocsparse_csrilu0_template(handle, + m, + nnz, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + policy, + temp_buffer); +} + +extern "C" rocsparse_status rocsparse_dcsrilu0(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_solve_policy policy, + void* temp_buffer) +{ + return rocsparse_csrilu0_template(handle, + m, + nnz, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + policy, + temp_buffer); +} diff --git a/library/src/precond/rocsparse_csrilu0.hpp b/library/src/precond/rocsparse_csrilu0.hpp new file mode 100644 index 00000000..54c4bebb --- /dev/null +++ b/library/src/precond/rocsparse_csrilu0.hpp @@ -0,0 +1,266 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef ROCSPARSE_CSRILU0_HPP +#define ROCSPARSE_CSRILU0_HPP + +#include "rocsparse.h" +#include "utility.h" +#include "csrilu0_device.h" + +#include + +template +rocsparse_status rocsparse_csrilu0_template(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_solve_policy policy, + void* temp_buffer) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging + log_trace(handle, + replaceX("rocsparse_Xcsrilu0"), + m, + nnz, + (const void*&)descr, + (const void*&)csr_val, + (const 
void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)info, + policy, + (const void*&)temp_buffer); + + log_bench(handle, + "./rocsparse-bench -f csrilu0 -r", + replaceX("X"), + "--mtx "); + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(temp_buffer == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Stream + hipStream_t stream = handle->stream; + + + + // Buffer + char* ptr = reinterpret_cast(temp_buffer); + + // zero pivot + rocsparse_int* d_zero_pivot = reinterpret_cast(ptr); + ptr += 256; + + // done array + rocsparse_int* d_done_array = reinterpret_cast(ptr); + + // Initialize buffers + RETURN_IF_HIP_ERROR(hipMemcpy(d_zero_pivot, &m, sizeof(rocsparse_int), hipMemcpyHostToDevice)); + RETURN_IF_HIP_ERROR(hipMemset(d_done_array, 0, sizeof(rocsparse_int) * m)); + +#define CSRILU0_DIM 256 + dim3 csrilu0_blocks((m * handle->wavefront_size - 1) / CSRILU0_DIM + 1); + dim3 csrilu0_threads(CSRILU0_DIM); + + if(handle->wavefront_size == 32) + { + hipLaunchKernelGGL((csrilu0_binsearch_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + d_zero_pivot, + 
descr->base); + } + else if(handle->wavefront_size == 64) + { + if(info->csrilu0_info->max_nnz <= 64) + { + hipLaunchKernelGGL((csrilu0_hash_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + d_zero_pivot, + descr->base); + } + else if(info->csrilu0_info->max_nnz <= 128) + { + hipLaunchKernelGGL((csrilu0_hash_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + d_zero_pivot, + descr->base); + } + else if(info->csrilu0_info->max_nnz <= 256) + { + hipLaunchKernelGGL((csrilu0_hash_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + d_zero_pivot, + descr->base); + } + else if(info->csrilu0_info->max_nnz <= 512) + { + hipLaunchKernelGGL((csrilu0_hash_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + d_zero_pivot, + descr->base); + } + else if(info->csrilu0_info->max_nnz <= 1024) + { + hipLaunchKernelGGL((csrilu0_hash_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + d_zero_pivot, + descr->base); + } + else + { + printf("standard kernel\n"); + hipLaunchKernelGGL((csrilu0_binsearch_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + d_zero_pivot, + descr->base); + } + } + else + { + return rocsparse_status_arch_mismatch; + } +#undef CSRILU0_DIM 
+/* +// TODO this is blocking somehow + int zero; + hipMemcpyAsync(&zero, d_zero_pivot, sizeof(int), hipMemcpyDeviceToHost, stream); + + if(zero != m) + printf("Zero pivot: %d\n", zero); +*/ + return rocsparse_status_success; +} + +#endif // ROCSPARSE_CSRILU0_HPP From 852f8345e24a343f4d82864c54fe2ef7587c0c3b Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Wed, 5 Sep 2018 13:01:27 +0200 Subject: [PATCH 236/304] csrsv_clear filled ; csrilu0 now finds shared meta data too --- library/src/level2/rocsparse_csrsv.cpp | 28 ++++++++++++--- library/src/level2/rocsparse_csrsv.hpp | 2 -- library/src/precond/rocsparse_csrilu0.cpp | 43 +++++++++++++++++++++-- 3 files changed, 63 insertions(+), 10 deletions(-) diff --git a/library/src/level2/rocsparse_csrsv.cpp b/library/src/level2/rocsparse_csrsv.cpp index 12a2421b..97ee8c97 100644 --- a/library/src/level2/rocsparse_csrsv.cpp +++ b/library/src/level2/rocsparse_csrsv.cpp @@ -232,8 +232,6 @@ extern "C" rocsparse_status rocsparse_csrsv_analysis(rocsparse_handle handle, csr_row_ptr, csr_col_ind, info->csrsv_upper_info, - solve, - analysis, temp_buffer)); } else @@ -254,11 +252,12 @@ extern "C" rocsparse_status rocsparse_csrsv_analysis(rocsparse_handle handle, // Check for other lower analysis meta data rocsparse_csrtr_info reuse = nullptr; - // ILU0 meta data + // csrilu0 meta data if(info->csrilu0_info != nullptr) { reuse = info->csrilu0_info; } + // TODO add more crossover data here @@ -290,8 +289,6 @@ extern "C" rocsparse_status rocsparse_csrsv_analysis(rocsparse_handle handle, csr_row_ptr, csr_col_ind, info->csrsv_lower_info, - solve, - analysis, temp_buffer)); } @@ -301,6 +298,27 @@ extern "C" rocsparse_status rocsparse_csrsv_analysis(rocsparse_handle handle, extern "C" rocsparse_status rocsparse_csrsv_clear(const rocsparse_mat_descr descr, rocsparse_mat_info info) { + // Determine which info meta data should be deleted + if(descr->fill_mode == rocsparse_fill_mode_lower) + { + // If meta data is shared, do not delete anything 
+ if(info->csrilu0_info == info->csrsv_lower_info) + { + info->csrsv_lower_info = nullptr; + + return rocsparse_status_success; + } + + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrsv_lower_info)); + info->csrsv_lower_info = nullptr; + } + else if(descr->fill_mode == rocsparse_fill_mode_upper) + { + // Upper info has no shares (yet) + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrsv_upper_info)); + info->csrsv_upper_info = nullptr; + } + return rocsparse_status_success; } diff --git a/library/src/level2/rocsparse_csrsv.hpp b/library/src/level2/rocsparse_csrsv.hpp index 9dba8d07..812a24c6 100644 --- a/library/src/level2/rocsparse_csrsv.hpp +++ b/library/src/level2/rocsparse_csrsv.hpp @@ -23,8 +23,6 @@ static rocsparse_status rocsparse_csrtr_analysis(rocsparse_handle handle, const rocsparse_int* csr_row_ptr, const rocsparse_int* csr_col_ind, rocsparse_csrtr_info info, - rocsparse_solve_policy solve, - rocsparse_analysis_policy analysis, void* temp_buffer) { // Stream diff --git a/library/src/precond/rocsparse_csrilu0.cpp b/library/src/precond/rocsparse_csrilu0.cpp index 75339682..29d75d93 100644 --- a/library/src/precond/rocsparse_csrilu0.cpp +++ b/library/src/precond/rocsparse_csrilu0.cpp @@ -126,13 +126,52 @@ extern "C" rocsparse_status rocsparse_csrilu0_analysis(rocsparse_handle handle, return rocsparse_status_success; } + // Differentiate the analysis policies + if(analysis == rocsparse_analysis_policy_reuse) + { + // We try to re-use already analyzed lower part, if available. + // It is the user's responsibility that this data is still valid, + // since he passed the 'reuse' flag. 
+ + // If csrilu0 meta data is already available, do nothing + if(info->csrilu0_info != nullptr) + { + return rocsparse_status_success; + } + + // Check for other lower analysis meta data + rocsparse_csrtr_info reuse = nullptr; + + // csrsv_lower meta data + if(info->csrsv_lower_info != nullptr) + { + reuse = info->csrsv_lower_info; + } + + // TODO add more crossover data here + + + + + // If data has been found, use it + if(reuse != nullptr) + { + info->csrilu0_info = reuse; + + return rocsparse_status_success; + } + } + + // User is explicitly asking to force a re-analysis, or no valid data has been + // found to be re-used. + // Clear csrilu0 info RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrilu0_info)); // Create csrilu0 info RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_csrtr_info(&info->csrilu0_info)); - // Call analysis routine (shared with csrsv and csric0) + // Perform analysis RETURN_IF_ROCSPARSE_ERROR(rocsparse_csrtr_analysis(handle, rocsparse_operation_none, m, @@ -141,8 +180,6 @@ extern "C" rocsparse_status rocsparse_csrilu0_analysis(rocsparse_handle handle, csr_row_ptr, csr_col_ind, info->csrilu0_info, - solve, - analysis, temp_buffer)); return rocsparse_status_success; From 545870ad812ed7909017e848a157332774e9a39e Mon Sep 17 00:00:00 2001 From: Nico Trost Date: Thu, 6 Sep 2018 09:54:41 +0200 Subject: [PATCH 237/304] initial documentation --- docs/Doxyfile | 2459 +++++++++++++++++++++++++++++ docs/rocm.jpg | Bin 0 -> 6761 bytes docs/run_doc.sh | 19 + docs/run_doxygen.sh | 13 + docs/source/Makefile | 20 + docs/source/allapi.rst | 9 + docs/source/api.rst | 182 +++ docs/source/conf.py | 185 +++ docs/source/index.rst | 21 + docs/source/library.rst | 2887 ++++++++++++++++++++++++++++++++++ docs/source/requirements.txt | 3 + 11 files changed, 5798 insertions(+) create mode 100644 docs/Doxyfile create mode 100644 docs/rocm.jpg create mode 100755 docs/run_doc.sh create mode 100755 docs/run_doxygen.sh create mode 100644 docs/source/Makefile 
create mode 100644 docs/source/allapi.rst create mode 100644 docs/source/api.rst create mode 100644 docs/source/conf.py create mode 100644 docs/source/index.rst create mode 100644 docs/source/library.rst create mode 100644 docs/source/requirements.txt diff --git a/docs/Doxyfile b/docs/Doxyfile new file mode 100644 index 00000000..95c1a77c --- /dev/null +++ b/docs/Doxyfile @@ -0,0 +1,2459 @@ +# Doxyfile 1.8.10 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "rocSPARSE" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = v3.0.1.0 + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HIP" + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = ./rocm.jpg + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = docBin + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise cause +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO.
+ +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. 
If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
+ +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful if your file system doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO.
+ +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = YES + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. 
For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the latter case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibility issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match function declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type.
If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = YES + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. 
If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = YES + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +SHOW_NAMESPACES = NO + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. 
Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. 
+ +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. 
+ +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. 
Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. 
+ +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. 
+ +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. 
See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. 
+ +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = rocsparse-functions.h \ + rocsparse-auxiliary.h \ + rocsparse-types.h \ + ../ + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. 
+# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, +# *.vhdl, *.ucf, *.qsf, *.as and *.js. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.f90 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.as \ + *.js + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. 
+# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output.
If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. 
+ +USE_MDFILE_AS_MAINPAGE = ../README.md + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. 
+ +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance.
This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# compiled with the --with-libclang option. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
+ +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. 
The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 1 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = YES + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from http://www.mathjax.org before deployment. +# The default value is: http://cdn.mathjax.org/mathjax/latest. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. 
For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use <access key> + S +# (what the <access key> is depends on the OS and browser, but it is typically +# <CTRL>, <ALT>/<option>