This repository has been archived by the owner on Jun 28, 2024. It is now read-only.

Merge remote-tracking branch 'openai/main' into IFU-230517
Conflicts:
	lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp
	lib/Target/LLVMIR/LLVMIRTranslation.cpp
	python/test/unit/language/assert_helper.py
	python/triton/third_party/cuda/bin/ptxas
	test/Conversion/tritongpu_to_llvm.mlir

 It looks like you may be committing a merge.
 If this is not correct, please remove the file
	.git/MERGE_HEAD
 and try again.
jayfurmanek committed May 17, 2023
2 parents dbf6a63 + 17eb982 commit 4c4e42e
Showing 99 changed files with 4,563 additions and 1,253 deletions.
16 changes: 14 additions & 2 deletions .github/workflows/integration-tests.yml
@@ -25,9 +25,9 @@ jobs:
id: set-matrix
run: |
if [ x"${{ github.repository }}" == x"openai/triton" ]; then
echo '::set-output name=matrix::[["self-hosted", "A100"], ["self-hosted", "V100"], ["self-hosted", "gfx908"], "macos-10.15"]'
echo '::set-output name=matrix::[["self-hosted", "A100"], ["self-hosted", "V100"], ["self-hosted", "gfx908"]]'
else
echo '::set-output name=matrix::["ubuntu-latest", "macos-10.15"]'
echo '::set-output name=matrix::["ubuntu-latest"]'
fi
Integration-Tests:
@@ -101,6 +101,18 @@ jobs:
cd python/test/unit
python3 -m pytest
- name: Create artifacts archive
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100')}}
run: |
tar -czvf artifacts.tar.gz ~/.triton/cache
- name: Upload artifacts archive
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100')}}
uses: actions/upload-artifact@v2
with:
name: artifacts
path: artifacts.tar.gz

- name: Run CXX unittests
if: ${{ env.BACKEND != 'ROCM'}}
run: |
2 changes: 1 addition & 1 deletion .github/workflows/wheels.yml
@@ -30,7 +30,7 @@ jobs:
#export CIBW_MANYLINUX_PYPY_X86_64_IMAGE="quay.io/pypa/manylinux2014_x86_64:latest"
export CIBW_BEFORE_BUILD="pip install cmake;"
export CIBW_SKIP="{cp,pp}35-*"
export CIBW_BUILD="{cp,pp}3*-manylinux_x86_64"
export CIBW_BUILD="{cp,pp}3*-manylinux_x86_64 cp3*-musllinux_x86_64"
python3 -m cibuildwheel python --output-dir wheelhouse
3 changes: 0 additions & 3 deletions .gitmodules

This file was deleted.

6 changes: 0 additions & 6 deletions CMakeLists.txt
@@ -49,12 +49,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
# Third-party
include_directories(${PYBIND11_INCLUDE_DIR})

if(WIN32)
SET(BUILD_SHARED_LIBS OFF)
find_package(dlfcn-win32 REQUIRED)
set(CMAKE_DL_LIBS dlfcn-win32::dl)
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -D__STDC_FORMAT_MACROS -fPIC -std=gnu++17 -fvisibility=hidden -fvisibility-inlines-hidden")

if (TRITON_USE_ROCM)
1 change: 1 addition & 0 deletions docs/python-api/triton.language.rst
@@ -34,6 +34,7 @@ Shape Manipulation Ops
:nosignatures:

broadcast_to
expand_dims
reshape
ravel

97 changes: 77 additions & 20 deletions include/triton/Analysis/Allocation.h
@@ -1,6 +1,7 @@
#ifndef TRITON_ANALYSIS_ALLOCATION_H
#define TRITON_ANALYSIS_ALLOCATION_H

#include "triton/Analysis/Utility.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
@@ -49,18 +50,25 @@ template <typename T> class Interval {
T End = std::numeric_limits<T>::max();
};

template <class T> Interval(T, T) -> Interval<T>;

class Allocation {
public:
/// A unique identifier for shared memory buffers
using BufferId = size_t;
using BufferIdSetT = DenseSet<BufferId>;
using FuncAllocMapT = CallGraph<Allocation>::FuncDataMapT;

static constexpr BufferId InvalidBufferId =
std::numeric_limits<BufferId>::max();

Allocation() = default;
/// Creates a new Allocation analysis that computes the shared memory
/// information for all associated shared memory values.
Allocation(Operation *operation) : operation(operation) { run(); }
explicit Allocation(Operation *operation) : operation(operation) {}

/// Runs allocation analysis on the given top-level operation.
void run(FuncAllocMapT &funcAllocMap);

/// Returns the operation this analysis was constructed from.
Operation *getOperation() const { return operation; }
@@ -75,6 +83,12 @@ class Allocation {
return bufferSet.at(bufferId).size;
}

/// Returns the allocated interval of the given buffer.
Interval<size_t> getAllocatedInterval(BufferId bufferId) const {
auto &buffer = bufferSet.at(bufferId);
return Interval<size_t>(buffer.offset, buffer.offset + buffer.size);
}

/// Returns the buffer id of the given value.
/// This interface only returns the allocated buffer id.
/// If you want to get all the buffer ids that are associated with the given
@@ -104,26 +118,28 @@
BufferId getBufferId(Operation *operation) const {
if (opScratch.count(operation)) {
return opScratch.lookup(operation)->id;
} else if (opVirtual.count(operation)) {
return opVirtual.lookup(operation)->id;
} else {
return InvalidBufferId;
}
}

/// Returns whether the given buffer is a virtual buffer.
bool isVirtualBuffer(BufferId bufferId) const {
return bufferSet.at(bufferId).kind == BufferT::BufferKind::Virtual;
}

/// Returns the total size of allocated shared memory
size_t getSharedMemorySize() const { return sharedMemorySize; }

bool isIntersected(BufferId lhsId, BufferId rhsId) const {
if (lhsId == InvalidBufferId || rhsId == InvalidBufferId)
return false;
auto lhsBuffer = bufferSet.at(lhsId);
auto rhsBuffer = bufferSet.at(rhsId);
return lhsBuffer.intersects(rhsBuffer);
}

private:
/// A class that represents a shared memory buffer
struct BufferT {
enum class BufferKind { Explicit, Scratch };
/// Explicit: triton_gpu.alloc_tensor
/// Scratch: triton_gpu.convert_layout
/// Virtual: triton.call
enum class BufferKind { Explicit, Scratch, Virtual };

/// MT: thread-safe
inline static std::atomic<BufferId> nextId = 0;
@@ -142,12 +158,6 @@ class Allocation {
BufferT(BufferKind kind, size_t size) : BufferT(kind, size, 0) {}
BufferT(BufferKind kind, size_t size, size_t offset)
: kind(kind), id(nextId++), size(size), offset(offset) {}

bool intersects(const BufferT &other) const {
return Interval<size_t>(offset, offset + size)
.intersects(
Interval<size_t>(other.offset, other.offset + other.size));
}
};

/// Op -> Scratch Buffer
@@ -158,8 +168,6 @@ class Allocation {
using AliasBufferMapT = llvm::MapVector<Value, llvm::SetVector<BufferT *>>;
/// BufferId -> Buffer
using BufferSetT = std::map<BufferId, BufferT>;
/// Runs allocation analysis on the given top-level operation.
void run();

private:
template <BufferT::BufferKind Kind, typename KeyType, typename... Args>
@@ -168,6 +176,8 @@ class Allocation {
bufferSet[buffer.id] = std::move(buffer);
if constexpr (Kind == BufferT::BufferKind::Explicit) {
valueBuffer[key] = &bufferSet[buffer.id];
} else if constexpr (Kind == BufferT::BufferKind::Virtual) {
opVirtual[key] = &bufferSet[buffer.id];
} else {
opScratch[key] = &bufferSet[buffer.id];
}
@@ -178,8 +188,9 @@ class Allocation {
}

private:
Operation *operation;
Operation *operation = nullptr;
OpScratchMapT opScratch;
OpScratchMapT opVirtual;
ValueBufferMapT valueBuffer;
AliasBufferMapT aliasBuffer;
BufferSetT bufferSet;
@@ -188,7 +199,53 @@ class Allocation {
friend class triton::AllocationAnalysis;
};

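For orientation amid the Interval changes above: the allocator models each shared-memory buffer as the half-open byte range [offset, offset + size), which is what the new getAllocatedInterval returns, and isIntersected is an overlap test on two such ranges. Below is a minimal standalone sketch of that bookkeeping (plain C++17, no MLIR dependencies; this Interval is a simplified stand-in for the class in the header, not the real one).

```cpp
#include <iostream>

// Simplified stand-in for the Interval<T> above: a half-open byte range
// [Start, End), as produced from a buffer's (offset, offset + size).
template <typename T> struct Interval {
  T Start{};
  T End{};
  Interval(T S, T E) : Start(S), End(E) {}
  // Two half-open ranges overlap iff each one starts before the other ends.
  bool intersects(const Interval &other) const {
    return Start < other.End && other.Start < End;
  }
};
// Deduction guide, mirroring the one added in the header:
// Interval(0u, 128u) deduces Interval<unsigned> without spelling the type.
template <typename T> Interval(T, T) -> Interval<T>;

int main() {
  Interval a(0u, 0u + 128u);    // buffer at offset 0, 128 bytes
  Interval b(128u, 128u + 64u); // adjacent buffer: shares no byte with a
  Interval c(96u, 96u + 64u);   // overlaps both a and b
  std::cout << a.intersects(b) << ' ' << a.intersects(c) << ' '
            << b.intersects(c) << '\n'; // prints: 0 1 1
  return 0;
}
```

The half-open convention keeps back-to-back allocations from being reported as conflicts, which is what isIntersected relies on when comparing two buffers' allocated intervals.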
template <typename T> Interval(T, T) -> Interval<T>;
/// Static analysis that computes the allocation of shared memory buffers
/// of the entire call graph.
/// The allocation is performed in a post-order walk of the call graph.
/// Each call op is treated like convert_layout that allocates a scratch buffer.
/// At each call, we compute the start offset of the scratch buffer and pass it
/// as an argument to the callee.
class ModuleAllocation : public CallGraph<Allocation> {
public:
using FuncOffsetMapT = DenseMap<FunctionOpInterface, Value>;

explicit ModuleAllocation(ModuleOp moduleOp)
: CallGraph<Allocation>(moduleOp) {
walk<WalkOrder::PreOrder, WalkOrder::PostOrder>(
// Pre-order edge walk callback
[](CallOpInterface callOp, FunctionOpInterface funcOp) {},
// Post-order node walk callback
[&](FunctionOpInterface funcOp) {
auto [iter, inserted] = funcMap.try_emplace(funcOp, funcOp);
if (inserted)
iter->second.run(funcMap);
});
}

size_t getSharedMemorySize() {
size_t size = 0;
for (auto funcOp : getRoots()) {
auto *alloc = getFuncData(funcOp);
size = std::max(size, alloc->getSharedMemorySize());
}
return size;
}

size_t getSharedMemorySize(FunctionOpInterface funcOp) {
return getFuncData(funcOp)->getSharedMemorySize();
}

void setFunctionSharedMemoryValue(FunctionOpInterface funcOp, Value value) {
sharedMemoryValue[funcOp] = value;
}

Value getFunctionSharedMemoryBase(FunctionOpInterface funcOp) {
return sharedMemoryValue[funcOp];
}

private:
FuncOffsetMapT sharedMemoryValue;
};

} // namespace mlir

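The new ModuleAllocation comment describes the sizing strategy: walk the call graph post-order, treat each call like a scratch allocation, and hand the callee a start offset past the caller's own buffers. Here is a rough standalone sketch of that accumulation under the same no-recursion assumption (plain C++17; ToyFunc, footprint, and the byte counts are invented for illustration and do not mirror the real MLIR data structures).

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Toy call-graph node: local shared-memory demand plus callees. This models
// only the sizing idea from the comment above, not the real MLIR analysis.
struct ToyFunc {
  std::size_t localBytes = 0;
  std::vector<std::string> callees;
};
using ToyGraph = std::map<std::string, ToyFunc>;

// Post-order sizing: a function's footprint is its own buffers plus the
// largest callee footprint. In the real pass the callee region begins at a
// start offset just past the caller's buffers, passed down at the call site;
// here we only accumulate sizes.
std::size_t footprint(const ToyGraph &g, const std::string &name) {
  const ToyFunc &f = g.at(name);
  std::size_t calleeMax = 0;
  for (const std::string &callee : f.callees)
    calleeMax = std::max(calleeMax, footprint(g, callee)); // assumes no recursion
  return f.localBytes + calleeMax;
}

int main() {
  ToyGraph g;
  g["leaf"] = {256, {}};
  g["helper"] = {512, {"leaf"}};
  g["kernel"] = {1024, {"helper", "leaf"}}; // single root of the call graph
  // Module-level shared memory is the maximum over roots, as in
  // ModuleAllocation::getSharedMemorySize() above.
  std::cout << footprint(g, "kernel") << " bytes\n"; // 1024 + (512 + 256) = 1792
  return 0;
}
```

The real analysis keeps per-function results in funcMap and reports the module-wide requirement as the maximum over call-graph roots, which is what getSharedMemorySize() above computes.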
55 changes: 55 additions & 0 deletions include/triton/Analysis/AxisInfo.h
@@ -286,16 +286,71 @@ class AxisInfoAnalysis
AxisInfoAnalysis(DataFlowSolver &solver);
using dataflow::SparseDataFlowAnalysis<
dataflow::Lattice<AxisInfo>>::getLatticeElement;
using FuncAxisInfoMapT = DenseMap<FunctionOpInterface, AxisInfo>;

void visitOperation(Operation *op,
ArrayRef<const dataflow::Lattice<AxisInfo> *> operands,
ArrayRef<dataflow::Lattice<AxisInfo> *> results) override;
};

/// Module level axis info analysis based on the call graph, assuming that we
/// do not have recursive functions.
/// Since each function will be called multiple times, we need to
/// calculate the axis info based on the axis info of all the callers.
/// In the future, we can perform optimization using function cloning so that
/// each call site will have unique axis info.
using AxisInfoMapT = DenseMap<Value, AxisInfo>;
class ModuleAxisInfoAnalysis : public CallGraph<AxisInfoMapT> {
public:
explicit ModuleAxisInfoAnalysis(ModuleOp moduleOp)
: CallGraph<AxisInfoMapT>(moduleOp) {
SmallVector<FunctionOpInterface> funcs;
for (auto root : getRoots()) {
walk<WalkOrder::PreOrder, WalkOrder::PostOrder>(
// Pre-order edge walk callback
[](CallOpInterface callOp, FunctionOpInterface funcOp) {},
// Post-order node walk callback
[&](FunctionOpInterface funcOp) {
funcs.push_back(funcOp);
funcMap.try_emplace(funcOp, AxisInfoMapT{});
});
}
SetVector<FunctionOpInterface> sortedFuncs(funcs.begin(), funcs.end());
SymbolTableCollection symbolTable;
for (auto funcOp : llvm::reverse(sortedFuncs)) {
initialize(funcOp);
funcOp.walk([&](CallOpInterface callOp) {
auto callee =
dyn_cast<FunctionOpInterface>(callOp.resolveCallable(&symbolTable));
update(callOp, callee);
});
}
}

AxisInfo *getAxisInfo(Value value) {
auto funcOp =
value.getParentRegion()->getParentOfType<FunctionOpInterface>();
auto *axisInfoMap = getFuncData(funcOp);
if (!axisInfoMap) {
return nullptr;
}
auto it = axisInfoMap->find(value);
if (it == axisInfoMap->end()) {
return nullptr;
}
return &(it->second);
}

unsigned getPtrContiguity(Value ptr);

unsigned getPtrAlignment(Value ptr);

unsigned getMaskAlignment(Value mask);

private:
void initialize(FunctionOpInterface funcOp);

void update(CallOpInterface callOp, FunctionOpInterface funcOp);
};

} // namespace mlir
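ModuleAxisInfoAnalysis derives a callee's argument info from every call site, since the callee may be entered from any of them. Below is a toy sketch of that merge, assuming divisibility is combined with a gcd as is typical for such lattices (plain C++17; ToyAxisInfo and the sample values are invented, and the real lattice also tracks contiguity and constancy).

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// Toy stand-in for AxisInfo: track only a pointer argument's known
// divisibility in bytes.
struct ToyAxisInfo {
  std::uint64_t divisibility = 1;
};

// A callee may be entered from any call site, so it can only assume what
// every caller guarantees; for divisibility this sketch uses the gcd.
ToyAxisInfo join(ToyAxisInfo a, ToyAxisInfo b) {
  return {std::gcd(a.divisibility, b.divisibility)};
}

int main() {
  // Hypothetical call sites passing the same pointer parameter.
  std::vector<ToyAxisInfo> callers = {{64}, {32}, {16}};
  ToyAxisInfo calleeArg = callers.front();
  for (std::size_t i = 1; i < callers.size(); ++i)
    calleeArg = join(calleeArg, callers[i]);
  // The callee can only rely on the fact shared by all callers.
  std::cout << "callee divisibility: " << calleeArg.divisibility << '\n'; // 16
  return 0;
}
```

Cloning functions per call site, which the comment above mentions as a future optimization, would keep the stronger per-caller facts instead of collapsing them to the shared lower bound.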