Improve TessellateIPU variables naming (and profile duplication) #40

Merged: 1 commit, Sep 29, 2023
6 changes: 4 additions & 2 deletions tessellate_ipu/lax/tile_lax_array.py
@@ -216,8 +216,10 @@ def tile_sharded_identity(dtype: DTypeLike, tiles: Tuple[int, ...]) -> TileShard
     # Build zero matrix + update diagonal entries.
     arr = tile_fill((N,), 0, dtype=dtype, tiles=tiles)
     # Requiring constants for indices + updates. Something more efficient?
-    indices = tile_constant_sharded(np.arange(0, N, dtype=np.uint32).reshape(N, 1, 1), tiles=tiles)
-    updates = tile_constant_replicated(np.array([1], dtype=dtype), tiles=tiles)
+    with jax.named_scope("indices"):
+        indices = tile_constant_sharded(np.arange(0, N, dtype=np.uint32).reshape(N, 1, 1), tiles=tiles)
+    with jax.named_scope("updates"):
+        updates = tile_constant_replicated(np.array([1], dtype=dtype), tiles=tiles)
     # Not the simplest way ever of updating diagonal terms!
     scatter_dnums = jax.lax.ScatterDimensionNumbers(
         update_window_dims=(), inserted_window_dims=(0,), scatter_dims_to_operand_dims=(0,)
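For context, `jax.named_scope` is a context manager that pushes a name onto JAX's name stack, so every op traced inside it carries that scope in its metadata and shows up under a readable group in PopVision profiles. A minimal sketch of the idea, using plain `jax.numpy` stand-ins instead of the TessellateIPU primitives (hypothetical shapes and dtypes):

```python
import jax
import jax.numpy as jnp

def diag_constants(n):
    # Ops traced under each named_scope inherit "indices" / "updates"
    # in their metadata, keeping the two constants distinguishable.
    with jax.named_scope("indices"):
        indices = jnp.arange(0, n, dtype=jnp.uint32).reshape(n, 1, 1)
    with jax.named_scope("updates"):
        updates = jnp.ones((1,), dtype=jnp.float32)
    return indices, updates

# The scope names appear in the lowered module's op metadata.
print(jax.jit(diag_constants, static_argnums=0).lower(4).as_text())
```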
23 changes: 16 additions & 7 deletions tessellate_ipu/lib/tessellate_ipu_ops_jax.cpp
@@ -42,7 +42,8 @@ class TilePutShardedPrimitive : public TilePutBase {
       poplar::Graph& graph, const std::vector<poplar::Tensor>& inputs,
       std::vector<poplar::Tensor>& outputs, const std::string& attributes,
       const std::string& debug_prefix) {
-    const auto debug_context = poplar::DebugContext(debug_prefix);
+    const auto debug_context = poplar::DebugContext(
+        makeTileOpDebugPrefix(debug_prefix, "tile_put_sharded"));
     // Passing the tile array as attributes.
     const auto tile_array = extractTileArray(attributes);
     return lowerTilePutShardedToPoplar(graph, inputs, outputs, tile_array,
@@ -54,12 +55,15 @@
       poplar::Type type,
       const std::string& attributes,
       const std::string& debug_prefix) {
+    const auto debug_context = poplar::DebugContext(
+        makeTileOpDebugPrefix(debug_prefix, "tile_put_sharded"));
     const auto tile_array = extractTileArray(attributes);
     const auto item_shape =
         poplar::ArrayRef<std::size_t>(shape.data() + 1, shape.size() - 1);
     // If not allocated => already pre-allocate input with proper tile mapping.
     // TODO: fix (unnecessary) on-tile-copy when doing that?
-    return createShardedVariable(graph, type, item_shape, tile_array);
+    return createShardedVariable(graph, type, item_shape, tile_array,
+                                 debug_context);
   }
 };

@@ -83,7 +87,8 @@ class TilePutReplicatedPrimitive : public TilePutBase {
       poplar::Graph& graph, const std::vector<poplar::Tensor>& inputs,
       std::vector<poplar::Tensor>& outputs, const std::string& attributes,
       const std::string& debug_prefix) {
-    const auto debug_context = poplar::DebugContext(debug_prefix);
+    const auto debug_context = poplar::DebugContext(
+        makeTileOpDebugPrefix(debug_prefix, "tile_put_replicated"));
     const auto tile_array = extractTileArray(attributes);
     return lowerTilePutReplicatedToPoplar(graph, inputs, outputs, tile_array,
                                           debug_context);
@@ -109,7 +114,8 @@ class TileGatherPrimitive : public jax::ipu::PrimitiveInterface {
       poplar::Graph& graph, const std::vector<poplar::Tensor>& inputs,
       std::vector<poplar::Tensor>& outputs, const std::string& attributes,
       const std::string& debug_prefix) {
-    const auto debug_context = poplar::DebugContext(debug_prefix);
+    const auto debug_context = poplar::DebugContext(
+        makeTileOpDebugPrefix(debug_prefix, "tile_gather"));
     // Tile gather parameters.
     const auto params = ipu::from_json_str<TileGatherParams>(attributes);
     return lowerTileGatherToPoplar(graph, inputs, outputs, params,
@@ -138,7 +144,8 @@ class TileDataBarrierPrimitive : public jax::ipu::PrimitiveInterface {
       poplar::Graph& graph, const std::vector<poplar::Tensor>& inputs,
       std::vector<poplar::Tensor>& outputs, const std::string& attributes,
       const std::string& debug_prefix) {
-    const auto debug_context = poplar::DebugContext(debug_prefix);
+    const auto debug_context = poplar::DebugContext(
+        makeTileOpDebugPrefix(debug_prefix, "tile_data_barrier"));
     // Tile barrier parameters (with tile sharding).
     const auto params = ipu::from_json_str<TileDataBarrierParams>(attributes);
     return lowerTileDataBarrierToPoplar(graph, inputs, outputs, params,
@@ -165,7 +172,8 @@ class TileConstantReplicatedPrimitive : public jax::ipu::PrimitiveInterface {
       poplar::Graph& graph, const std::vector<poplar::Tensor>& inputs,
       std::vector<poplar::Tensor>& outputs, const std::string& attributes,
       const std::string& debug_prefix) {
-    const auto debug_context = poplar::DebugContext(debug_prefix);
+    const auto debug_context = poplar::DebugContext(
+        makeTileOpDebugPrefix(debug_prefix, "tile_constant_replicated"));
     const auto params = ipu::from_json_str<TileConstantParams>(attributes);
     return lowerTileConstantReplicatedToPoplar(graph, inputs, outputs, params,
                                                debug_context);
@@ -191,7 +199,8 @@ class TileConstantShardedPrimitive : public jax::ipu::PrimitiveInterface {
       poplar::Graph& graph, const std::vector<poplar::Tensor>& inputs,
       std::vector<poplar::Tensor>& outputs, const std::string& attributes,
       const std::string& debug_prefix) {
-    const auto debug_context = poplar::DebugContext(debug_prefix);
+    const auto debug_context = poplar::DebugContext(
+        makeTileOpDebugPrefix(debug_prefix, "tile_constant_sharded"));
     const auto params = ipu::from_json_str<TileConstantParams>(attributes);
     return lowerTileConstantShardedToPoplar(graph, inputs, outputs, params,
                                             debug_context);
25 changes: 22 additions & 3 deletions tessellate_ipu/lib/tile_array_ops.cpp
@@ -12,6 +12,25 @@
 
 namespace ipu {
 
+std::string makeTileOpDebugPrefix(const std::string& raw_debug_prefix,
+                                  const std::string& basename) {
+  const auto format_debug_prefix = [&raw_debug_prefix,
+                                    &basename](std::size_t idx) {
+    const std::string debug_prefix =
+        fmt::format("{}{}", raw_debug_prefix.substr(0, idx), basename);
+    return debug_prefix;
+  };
+  std::string::size_type idx;
+  // A bit of ugly string pattern matching to remove the metadata, but keep
+  // the existing namespace.
+  idx = raw_debug_prefix.rfind(basename + "[");
+  if (idx != std::string::npos) {
+    return format_debug_prefix(idx);
+  }
+  // Not found => keep the same debug prefix.
+  return raw_debug_prefix;
+}
+
 poplar::Tensor tileBarrierReinterpretTensor(const poplar::Tensor& t,
                                             bool is_half_accurate) {
   // 8 bits data types.
@@ -69,8 +88,8 @@ poplar::program::Program lowerTilePutShardedToPoplar(
 
   // Create output tensor, with proper tile mapping.
   // TODO: link to Slack discussion on VarRegion contiguity.
-  auto output = createShardedVariable(graph, input.elementType(),
-                                      input[0].shape(), tile_array);
+  auto output = createShardedVariable(
+      graph, input.elementType(), input[0].shape(), tile_array, debug_context);
   // Copy data tensor into the output.
   auto prog = poplar::program::Copy(input, output);
   outputs.push_back(output);
@@ -91,7 +110,7 @@ poplar::program::Program lowerTilePutReplicatedToPoplar(
   // Create output tensor, with proper tile mapping.
   auto input_broadcasted = input.expand({0}).broadcast(tile_array.size(), 0);
   auto output = createShardedVariable(graph, input.elementType(), input.shape(),
-                                      tile_array);
+                                      tile_array, debug_context);
   // Copy data tensor into the output.
   auto prog = poplar::program::Copy(input_broadcasted, output, false);
   outputs.push_back(output);
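The effect of `makeTileOpDebugPrefix` is easiest to see on a concrete string. Below is a rough Python re-implementation of the same rewrite, for illustration only; the input prefixes are hypothetical examples of the metadata-heavy names JAX emits:

```python
def make_tile_op_debug_prefix(raw_debug_prefix: str, basename: str) -> str:
    # Mirror of the C++ rfind + substr logic: strip the bracketed op
    # metadata but keep the enclosing namespace.
    idx = raw_debug_prefix.rfind(basename + "[")
    if idx == -1:
        # Pattern not found => keep the same debug prefix.
        return raw_debug_prefix
    return raw_debug_prefix[:idx] + basename

# Hypothetical raw prefixes:
print(make_tile_op_debug_prefix(
    "jit_fn/tile_put_sharded[tiles=(0, 1, 2)]", "tile_put_sharded"))
# -> jit_fn/tile_put_sharded
print(make_tile_op_debug_prefix("jit_fn/some_other_op", "tile_put_sharded"))
# -> jit_fn/some_other_op  (unchanged)
```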
8 changes: 8 additions & 0 deletions tessellate_ipu/lib/tile_array_ops.hpp
@@ -6,6 +6,14 @@
 #include "base_types.hpp"
 
 namespace ipu {
+
+/**
+ * @brief Make a (readable/clean) tile op debug prefix.
+ * Helps with more readable naming in the PopVision profile.
+ */
+std::string makeTileOpDebugPrefix(const std::string& raw_debug_prefix,
+                                  const std::string& basename);
+
 /**
  * @brief IPU tile gather op parameters.
  */
48 changes: 21 additions & 27 deletions tessellate_ipu/lib/tile_array_utils.cpp
@@ -55,44 +55,38 @@ poplar::Tensor createReplicatedConstantTensor(
     poplar::ArrayRef<TileIndexType> tiles,
     const poplar::DebugContext& debug_context) {
   // TODO: check raw_values, dtype and shape are consistent.
-  // TODO: get it working with FP16!
-  // Expanded shape (used in concat).
-  const auto expand_shape = shapePrependAxis(1, shape);
-  // Create Poplar constant per tile. Should I create a single one?
-  std::vector<poplar::Tensor> tensor_list;
+  // Replicating raw values on the host. Should never be >1GB (worst case!).
+  // Allows creating a single constant tensor, which is better for the
+  // PopVision profile.
+  std::vector<char> replicated_raw_values(raw_values.size() * tiles.size());
+  auto it = replicated_raw_values.begin();
   for (size_t idx = 0; idx < tiles.size(); ++idx) {
-    auto t = createConstantTensor(graph, ipu_type, expand_shape, raw_values,
-                                  debug_context);
-    graph.setTileMapping(t, tiles[idx]);
-    tensor_list.push_back(t);
+    it = std::copy(raw_values.begin(), raw_values.end(), it);
   }
-  return poplar::concat(tensor_list, 0);
+  // Build the full constant tensor at once.
+  // TODO: make sure it works with FP16?
+  const auto replicated_shape = shapePrependAxis(tiles.size(), shape);
+  auto t = createConstantTensor(graph, ipu_type, replicated_shape,
+                                replicated_raw_values, debug_context);
+  for (size_t idx = 0; idx < tiles.size(); ++idx) {
+    graph.setTileMapping(t[idx], tiles[idx]);
+  }
+  return t;
 }
 
 poplar::Tensor createShardedConstantTensor(
     poplar::Graph& graph, const IpuType& ipu_type,
     poplar::ArrayRef<std::size_t> shape, poplar::ArrayRef<char> raw_values,
     poplar::ArrayRef<TileIndexType> tiles,
     const poplar::DebugContext& debug_context) {
-  // TODO: check consistent raw values size.
-  // Expanded shape on every tile.
-  const auto expand_shape =
-      shapePrependAxis(1, arraySlice(shape, 1, shape.size()));
-  const auto dtype_size = ipuTypeSize(ipu_type);
-  const std::size_t bytes_size = sizeFromShape(expand_shape) * dtype_size;
-  auto poplar_type = toPoplar(ipu_type);
-  // Create Poplar constant per tile. Should I create a single one?
-  std::vector<poplar::Tensor> tensor_list;
+  // TODO: check raw_values, dtype and shape are consistent.
+  // Creating a single tensor, to avoid PopVision profile bloating.
+  auto t =
+      createConstantTensor(graph, ipu_type, shape, raw_values, debug_context);
   for (size_t idx = 0; idx < tiles.size(); ++idx) {
-    // Slicing the raw data corresponding to the tile.
-    auto raw_values_tile =
-        arraySlice(raw_values, idx * bytes_size, (idx + 1) * bytes_size);
-    auto t = createConstantTensor(graph, ipu_type, expand_shape,
-                                  raw_values_tile, debug_context);
-    graph.setTileMapping(t, tiles[idx]);
-    tensor_list.push_back(t);
+    graph.setTileMapping(t[idx], tiles[idx]);
   }
-  return poplar::concat(tensor_list, 0);
+  return t;
 }
 
 }  // namespace ipu
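The host-side idea in both rewritten functions is the same: build one constant tensor covering all tiles, then map slice `t[idx]` to `tiles[idx]`, instead of creating one constant per tile and concatenating (which bloats the PopVision profile with many near-identical variables). A small numpy sketch of the replicated layout, with hypothetical values:

```python
import numpy as np

tiles = [0, 1, 2, 3]
values = np.array([1.0, 2.0], dtype=np.float32)  # per-tile payload

# Replicate the raw values once per tile on the host, the equivalent of
# the std::copy loop filling replicated_raw_values above.
replicated = np.tile(values, (len(tiles), 1))
assert replicated.shape == (len(tiles), values.size)

# Row idx is then tile-mapped to tiles[idx], mirroring
# graph.setTileMapping(t[idx], tiles[idx]).
for idx, tile in enumerate(tiles):
    print(f"row {idx} -> tile {tile}: {replicated[idx]}")
```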
2 changes: 0 additions & 2 deletions tessellate_ipu/lib/tile_map_ops.cpp
@@ -14,8 +14,6 @@ std::string makeTileMapCallDebugPrefix(const std::string& raw_debug_prefix,
                                        const std::string& primitive_name) {
   const auto format_debug_prefix = [&raw_debug_prefix,
                                     &primitive_name](std::size_t idx) {
-    // const std::string debug_prefix = raw_debug_prefix.substr(0, idx) +
-    //     "tile_map";
     const std::string debug_prefix =
         fmt::format("{}{}[{}]", raw_debug_prefix.substr(0, idx), "tile_map",
                     primitive_name);