refactor(ttm): add namespace ttm

bassoy committed Nov 1, 2024
1 parent c7c4cd4 commit fb3c0fa

Showing 20 changed files with 600 additions and 637 deletions.
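In short, this refactor moves the library's public API from namespace tlib into tlib::ttm. A rough before/after sketch of a call site (hypothetical variable names, inferred from the diffs below):

    // before this commit: everything lived directly in tlib
    auto C = tlib::ttm(q, A, B,
                       tlib::parallel_policy::parallel_blas,
                       tlib::slicing_policy::slice,
                       tlib::fusion_policy::none);

    // after this commit: the API sits in tlib::ttm,
    // which callers can pull in wholesale
    using namespace tlib::ttm;
    auto C = ttm(q, A, B,
                 parallel_policy::parallel_blas,
                 slicing_policy::slice,
                 fusion_policy::none);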
7 changes: 4 additions & 3 deletions example/interface1.cpp
@@ -4,11 +4,12 @@
 #include <numeric>
 #include <iostream>
 
+using namespace tlib::ttm;
 
 int main()
 {
     using value_t = float;
-    using tensor_t = tlib::tensor<value_t>; // or std::array<value_t,N>
+    using tensor_t = tensor<value_t>; // or std::array<value_t,N>
     using shape_t = typename tensor_t::shape_t;
 
     // shape tuple for A
@@ -25,8 +26,8 @@ int main()
     auto pb = nb.size();
 
     // layout tuple for A and C
-    auto pia = tlib::detail::generate_k_order_layout(pa,1ul);
-    auto pib = tlib::detail::generate_k_order_layout(pb,1ul);
+    auto pia = detail::generate_k_order_layout(pa,1ul);
+    auto pib = detail::generate_k_order_layout(pb,1ul);
 
     auto A = tensor_t( na, pia );
     auto B = tensor_t( nb, pib );
15 changes: 8 additions & 7 deletions example/interface2.cpp
@@ -4,11 +4,12 @@
 #include <numeric>
 #include <iostream>
 
+using namespace tlib::ttm;
 
 int main()
 {
     using value_t = float;
-    using tensor_t = tlib::tensor<value_t>; // or std::array<value_t,N>
+    using tensor_t = tensor<value_t>; // or std::array<value_t,N>
     using shape_t = typename tensor_t::shape_t;
 
     // shape tuple for A
@@ -25,8 +26,8 @@ int main()
     auto pb = nb.size();
 
     // layout tuple for A and C
-    auto pia = tlib::detail::generate_k_order_layout(pa,1ul);
-    auto pib = tlib::detail::generate_k_order_layout(pb,1ul);
+    auto pia = detail::generate_k_order_layout(pa,1ul);
+    auto pib = detail::generate_k_order_layout(pb,1ul);
 
     auto A = tensor_t( na, pia );
     auto B = tensor_t( nb, pib );
@@ -55,10 +56,10 @@ int main()
 
 
     // correct shape, layout and strides of the output tensors C1,C2 are automatically computed and returned by the functions.
-    auto C1 = tlib::ttm(q, A,B, tlib::parallel_policy::parallel_blas , tlib::slicing_policy::slice, tlib::fusion_policy::none );
-    auto C2 = tlib::ttm(q, A,B, tlib::parallel_policy::parallel_loop , tlib::slicing_policy::slice, tlib::fusion_policy::all );
-    auto C3 = tlib::ttm(q, A,B, tlib::parallel_policy::parallel_loop , tlib::slicing_policy::subtensor, tlib::fusion_policy::all );
-    auto C4 = tlib::ttm(q, A,B, tlib::parallel_policy::batched_gemm , tlib::slicing_policy::subtensor, tlib::fusion_policy::all );
+    auto C1 = ttm(q, A,B, parallel_policy::parallel_blas , slicing_policy::slice, fusion_policy::none );
+    auto C2 = ttm(q, A,B, parallel_policy::parallel_loop , slicing_policy::slice, fusion_policy::all );
+    auto C3 = ttm(q, A,B, parallel_policy::parallel_loop , slicing_policy::subtensor, fusion_policy::all );
+    auto C4 = ttm(q, A,B, parallel_policy::batched_gemm , slicing_policy::subtensor, fusion_policy::all );
 
 
     std::cout << "C1 = " << C1 << std::endl;
19 changes: 10 additions & 9 deletions example/interface3.cpp
@@ -4,23 +4,24 @@
 #include <numeric>
 #include <iostream>
 
+using namespace tlib::ttm;
 
 int main()
 {
     using value_t = float;
     using size_t = std::size_t;
     using tensor_t = std::vector<value_t>; // or std::array<value_t,N>
-    using shape_t = std::vector<size_t>;
+    using shape_t = std::vector<size_t>;
     using iterator_t = std::ostream_iterator<value_t>;
 
     auto na = shape_t{4,3,2}; // input shape tuple
     auto p = na.size();       // order of input tensor, i.e. number of dimensions - here 3
     auto k = 1ul;             // k-order of input tensor
     auto q = 2ul;
 
-    auto pia = tlib::detail::generate_k_order_layout(p,k); // layout tuple of input tensor - here {1,2,3};
-    auto wa = tlib::detail::generate_strides(na,pia);      // stride tuple of input tensor - here {1,4,12};
-    auto nna = std::accumulate(na.begin(),na.end(),1ul,std::multiplies<>()); // number of elements of input tensor
+    auto pia = detail::generate_k_order_layout(p,k); // layout tuple of input tensor - here {1,2,3};
+    auto wa = detail::generate_strides(na,pia);      // stride tuple of input tensor - here {1,4,12};
+    auto nna = std::accumulate(na.begin(),na.end(),1ul,std::multiplies<>()); // number of elements of input tensor
 
     auto pib = shape_t{1,2};
     auto nb = shape_t{na[q-1]+1,na[q-1]};
@@ -29,7 +30,7 @@ int main()
     auto nc = na;
     nc[q-1] = nb[0];
     auto pic = pia;
-    auto wc = tlib::detail::generate_strides(nc,pic);
+    auto wc = detail::generate_strides(nc,pic);
     auto nnc = std::accumulate(nc.begin(),nc.end(),1ul,std::multiplies<>()); // number of elements of input tensor
 
 
@@ -43,15 +44,15 @@ int main()
     std::cout << "A = [ "; std::copy(A.begin(), A.end(), iterator_t(std::cout, " ")); std::cout << " ];" << std::endl;
     std::cout << "B = [ "; std::copy(B.begin(), B.end(), iterator_t(std::cout, " ")); std::cout << " ];" << std::endl;
 
-    tlib::ttm(
-        tlib::parallel_policy::parallel_blas , tlib::slicing_policy::slice, tlib::fusion_policy::none,
+    ttm(
+        parallel_policy::parallel_blas , slicing_policy::slice, fusion_policy::none,
         q, p,
         A.data(), na.data(), wa.data(), pia.data(),
         B.data(), nb.data(), pib.data(),
         C1.data(), nc.data(), wc.data());
 
-    tlib::ttm(
-        tlib::parallel_policy::parallel_loop, tlib::slicing_policy::subtensor, tlib::fusion_policy::all,
+    ttm(
+        parallel_policy::parallel_loop, slicing_policy::subtensor, fusion_policy::all,
         q, p,
         A.data(), na.data(), wa.data(), pia.data(),
         B.data(), nb.data(), pib.data(),
33 changes: 17 additions & 16 deletions example/measure.cpp
@@ -6,6 +6,8 @@
 #include <string>
 #include <chrono> // for high precision timing
 
+using namespace tlib::ttm;
+
 static const auto gdims = std::string("abcdefghij");
 
 inline
@@ -72,9 +74,9 @@ get_gflops(double nn, double cdimc, double cdima)
 
 template<class value, class parallel_policy, class slicing_policy, class fusion_policy>
 inline void measure(unsigned q,
-                    tlib::tensor<value> const& A,
-                    tlib::tensor<value> const& B,
-                    tlib::tensor<value>& C,
+                    tensor<value> const& A,
+                    tensor<value> const& B,
+                    tensor<value>& C,
                     parallel_policy pp,
                     slicing_policy sp,
                     fusion_policy fp)
@@ -87,8 +89,7 @@ inline void measure(unsigned q,
     for(auto i = 0u; i < iters; ++i){
         std::fill(cache.begin(), cache.end(),char{});
         auto start = std::chrono::high_resolution_clock::now();
-        tlib::ttm(
-            pp, sp, fp,
+        ttm(pp, sp, fp,
             q, A.order(),
             A.data().data(), A.shape().data(), A.strides().data(), A.layout().data(),
             B.data().data(), B.shape().data(), B.layout().data(),
@@ -109,7 +110,7 @@ inline void measure(unsigned q,
     std::cout << "Time : " << avg_time_s << " [s]" << std::endl;
    std::cout << "Gflops : " << gflops << " [gflops]" << std::endl;
    std::cout << "Performance : " << gflops/avg_time_s << " [gflops/s]" << std::endl;
-    std::cout << "Performance : " << gflops/avg_time_s/tlib::detail::cores << " [gflops/s/core]" << std::endl;
+    std::cout << "Performance : " << gflops/avg_time_s/detail::cores << " [gflops/s/core]" << std::endl;
 }
 
 
@@ -122,7 +123,7 @@ int main(int argc, char* argv[])
 {
 
     using value = double;
-    using tensor = tlib::tensor<value>; // or std::array<value_t,N>
+    using tensor = tensor<value>; // or std::array<value_t,N>
     using shape = typename tensor::shape_t;
 
     assert(argc > 4);
@@ -159,9 +160,9 @@ int main(int argc, char* argv[])
     const auto pc = pa;
 
     // layout tuple for A and C
-    const auto pia = tlib::detail::generate_k_order_layout(pa,1ul);
-    const auto pib = tlib::detail::generate_k_order_layout(pb,1ul);
-    const auto pic = tlib::detail::generate_k_order_layout(pc,1ul);
+    const auto pia = detail::generate_k_order_layout(pa,1ul);
+    const auto pib = detail::generate_k_order_layout(pb,1ul);
+    const auto pic = detail::generate_k_order_layout(pc,1ul);
 
     auto A = tensor( na, pia );
     auto B = tensor( nb, pib );
@@ -172,37 +173,37 @@ int main(int argc, char* argv[])
 
     if(method == 1 || method == 7){
         std::cout << "Algorithm: <par-loop | slice-2d, all>" << std::endl;
-        measure(q, A, B, C, tlib::parallel_policy::parallel_loop, tlib::slicing_policy::slice, tlib::fusion_policy::all );
+        measure(q, A, B, C, parallel_policy::parallel_loop, slicing_policy::slice, fusion_policy::all );
         std::cout << "---------" << std::endl << std::endl;
     }
 
     if(method == 2 || method == 7){
         std::cout << "Algorithm: <par-loop | subtensor, all>" << std::endl;
-        measure(q, A, B, C, tlib::parallel_policy::parallel_loop, tlib::slicing_policy::subtensor, tlib::fusion_policy::all );
+        measure(q, A, B, C, parallel_policy::parallel_loop, slicing_policy::subtensor, fusion_policy::all );
         std::cout << "---------" << std::endl << std::endl;
     }
 
     if(method == 3 || method == 7){
         std::cout << "Algorithm: <par-gemm | slice-2d, none>" << std::endl;
-        measure(q, A, B, C, tlib::parallel_policy::parallel_blas, tlib::slicing_policy::slice, tlib::fusion_policy::none );
+        measure(q, A, B, C, parallel_policy::parallel_blas, slicing_policy::slice, fusion_policy::none );
         std::cout << "---------" << std::endl << std::endl;
     }
 
     if(method == 4 || method == 7){
         std::cout << "Algorithm: <par-gemm | slice-2d, all>" << std::endl;
-        measure(q, A, B, C, tlib::parallel_policy::parallel_blas, tlib::slicing_policy::slice, tlib::fusion_policy::all );
+        measure(q, A, B, C, parallel_policy::parallel_blas, slicing_policy::slice, fusion_policy::all );
         std::cout << "---------" << std::endl << std::endl;
     }
 
     if(method == 5 || method == 7){
         std::cout << "Algorithm: <par-gemm | subtensor, none>" << std::endl;
-        measure(q, A, B, C, tlib::parallel_policy::parallel_blas, tlib::slicing_policy::subtensor, tlib::fusion_policy::none );
+        measure(q, A, B, C, parallel_policy::parallel_blas, slicing_policy::subtensor, fusion_policy::none );
        std::cout << "---------" << std::endl << std::endl;
     }
 
     if(method == 6 || method == 7){
         std::cout << "Algorithm: <par-gemm | slice-qd, all>" << std::endl;
-        measure(q, A, B, C, tlib::parallel_policy::parallel_blas, tlib::slicing_policy::subtensor, tlib::fusion_policy::all );
+        measure(q, A, B, C, parallel_policy::parallel_blas, slicing_policy::subtensor, fusion_policy::all );
         std::cout << "---------" << std::endl << std::endl;
     }
 
22 changes: 3 additions & 19 deletions include/tlib/detail/cases.h
@@ -19,13 +19,13 @@
 
 #include <stdexcept>
 
-namespace tlib::detail{
+namespace tlib::ttm::detail{
 
 
 template<unsigned case_nr>
 inline constexpr bool is_case(unsigned p, std::size_t q, std::size_t const*const pi)
 {
-    static_assert(case_nr > 0u || case_nr < 9u, "tlib::detail::is_case: only 8 cases from 1 to 8 are covered.");
+    static_assert(case_nr > 0u || case_nr < 9u, "tlib::ttm::detail::is_case: only 8 cases from 1 to 8 are covered.");
     if constexpr (case_nr == 1u) return p==1u;
     if constexpr (case_nr == 2u) return p==2u && q == 1u && pi[0] == 1u;
     if constexpr (case_nr == 3u) return p==2u && q == 2u && pi[0] == 1u;
@@ -36,20 +36,4 @@ inline constexpr bool is_case(unsigned p, std::size_t q, std::size_t const*const
     if constexpr (case_nr == 8u) return p>=3u && !(is_case<6u>(p,q,pi)||is_case<7u>(p,q,pi));
 }
 
-
-//// assume that the input matrix (2nd argument) with a column-major format
-//template<unsigned case_nr>
-//inline constexpr bool is_case(unsigned p, std::size_t q, std::size_t const*const pi)
-//{
-//    static_assert(case_nr > 0u || case_nr < 9u, "tlib::detail::is_case: only 8 cases from 1 to 8 are covered.");
-//    if constexpr (case_nr == 1u) return p==1u;
-//    if constexpr (case_nr == 2u) return p==2u && q == 1u && pi[0] == 1u;
-//    if constexpr (case_nr == 3u) return p==2u && q == 2u && pi[0] == 1u;
-//    if constexpr (case_nr == 4u) return p==2u && q == 1u && pi[0] == 2u;
-//    if constexpr (case_nr == 5u) return p==2u && q == 2u && pi[0] == 2u;
-//    if constexpr (case_nr == 6u) return p>=3u && pi[0] == q;
-//    if constexpr (case_nr == 7u) return p>=3u && pi[p-1] == q;
-//    if constexpr (case_nr == 8u) return p>=3u && !(is_case<6u>(p,q,pi)||is_case<7u>(p,q,pi));
-//}
-
-} // namespace tlib::detail
+} // namespace tlib::ttm::detail
4 changes: 2 additions & 2 deletions include/tlib/detail/index.h
@@ -17,7 +17,7 @@
 
 #pragma once
 
-namespace tlib::detail
+namespace tlib::ttm::detail
 {
 
 
@@ -126,4 +126,4 @@ constexpr auto at_at_1(size_type const j_view, container_type const& w_view, con
 
 
 
-} // namespace detail
+} // namespace tlib::ttm::detail
12 changes: 6 additions & 6 deletions include/tlib/detail/layout.h
@@ -23,7 +23,7 @@
 
 
 
-namespace tlib::detail
+namespace tlib::ttm::detail
 {
 
 template<class InputIt>
@@ -60,7 +60,7 @@ inline void compute_k_order_layout(OutputIt begin, OutputIt end, size_t k)
     auto const n_signed = std::distance(begin,end);
 
     if(n_signed <= 0)
-        throw std::runtime_error("Error in tlib::detail::compute_k_order: range provided by begin and end not correct!");
+        throw std::runtime_error("Error in tlib::ttm::detail::compute_k_order: range provided by begin and end not correct!");
 
     auto const n = static_cast<std::make_unsigned_t<decltype(n_signed)>>(n_signed);
     assert(n > 0);
@@ -122,16 +122,16 @@ inline auto inverse_mode(InputIt layout_begin, InputIt layout_end, SizeType mode
 {
     using value_type = typename std::iterator_traits<InputIt>::value_type;
     if(!is_valid_layout(layout_begin,layout_end))
-        throw std::runtime_error("Error in tlib::detail::inverse_mode(): input layout is not valid.");
+        throw std::runtime_error("Error in tlib::ttm::detail::inverse_mode(): input layout is not valid.");
 
     auto const p_ = std::distance(layout_begin,layout_end);
     if(p_<= 0)
-        throw std::runtime_error("Error in tlib::detail::inverse_mode(): input layout is invalid.");
+        throw std::runtime_error("Error in tlib::ttm::detail::inverse_mode(): input layout is invalid.");
 
     auto const p = static_cast<value_type>(p_);
 
     if(mode==0u || mode > SizeType(p))
-        throw std::runtime_error("Error in tlib::detail::inverse_mode(): mode should be one-based and equal to or less than layout size.");
+        throw std::runtime_error("Error in tlib::ttm::detail::inverse_mode(): mode should be one-based and equal to or less than layout size.");
 
     auto inverse_mode = value_type{0u};
     for(; inverse_mode < p; ++inverse_mode)
@@ -146,4 +146,4 @@ inline auto inverse_mode(InputIt layout_begin, InputIt layout_end, SizeType mode
 
 
 
-} // namespace tlib::detail
+} // namespace tlib::ttm::detail
4 changes: 2 additions & 2 deletions include/tlib/detail/mtm.h
@@ -46,7 +46,7 @@
 
 
 
-namespace tlib::detail {
+namespace tlib::ttm::detail {
 
 struct cblas_layout {};
 
@@ -223,4 +223,4 @@ inline void mtm_cm(unsigned const q, unsigned const p,
 }
 
 
-} // namespace tlib::detail
+} // namespace tlib::ttm::detail
4 changes: 2 additions & 2 deletions include/tlib/detail/shape.h
@@ -25,7 +25,7 @@
 #include <vector>
 
 
-namespace tlib::detail
+namespace tlib::ttm::detail
 {
 
 template<class InputIt>
@@ -93,4 +93,4 @@ inline bool is_tensor(InputIt begin, InputIt end)
 
 
 
-} // namespace tlib::detail
+} // namespace tlib::ttm::detail
4 changes: 2 additions & 2 deletions include/tlib/detail/strides.h
@@ -25,7 +25,7 @@
 #include "layout.h"
 
 
-namespace tlib::detail
+namespace tlib::ttm::detail
 {
 
 template<class InputIt1, class InputIt2, class OutputIt>
@@ -100,4 +100,4 @@ inline bool is_valid_strides(InputIt1 layout_begin, InputIt1 layout_end, InputIt
 //    [stride_begin]( auto l ) {return stride_begin[l-2] > stride_begin[l-1];} );
 }
 
-} // namespace tlib::detail
+} // namespace tlib::ttm::detail