From f3bd8e681f8bf70141012c5fe621416a36edca46 Mon Sep 17 00:00:00 2001 From: void-main Date: Sun, 23 Apr 2023 10:27:34 +0000 Subject: [PATCH 01/21] get llama coded --- examples/cpp/CMakeLists.txt | 2 + examples/cpp/llama/CMakeLists.txt | 22 + examples/cpp/llama/bad_words.csv | 2 + examples/cpp/llama/check_with_huggingface.py | 15 + .../cpp/llama/huggingface_llama_convert.py | 187 +++ examples/cpp/llama/llama_config.ini | 32 + examples/cpp/llama/llama_example.cc | 503 +++++++ examples/cpp/llama/llama_triton_example.cc | 457 +++++++ examples/cpp/llama/model_config.json | 1 + examples/cpp/llama/start_ids.csv | 8 + examples/cpp/llama/stop_words.csv | 2 + src/fastertransformer/models/CMakeLists.txt | 2 + .../models/llama/CMakeLists.txt | 69 + src/fastertransformer/models/llama/Llama.cc | 1211 +++++++++++++++++ src/fastertransformer/models/llama/Llama.h | 218 +++ .../models/llama/LlamaContextDecoder.cc | 504 +++++++ .../models/llama/LlamaContextDecoder.h | 117 ++ .../models/llama/LlamaDecoder.cc | 381 ++++++ .../models/llama/LlamaDecoder.h | 104 ++ .../models/llama/LlamaDecoderLayerWeight.cc | 225 +++ .../models/llama/LlamaDecoderLayerWeight.h | 62 + .../models/llama/LlamaWeight.cc | 301 ++++ .../models/llama/LlamaWeight.h | 106 ++ .../triton_backend/CMakeLists.txt | 2 + 24 files changed, 4533 insertions(+) create mode 100644 examples/cpp/llama/CMakeLists.txt create mode 100644 examples/cpp/llama/bad_words.csv create mode 100644 examples/cpp/llama/check_with_huggingface.py create mode 100644 examples/cpp/llama/huggingface_llama_convert.py create mode 100644 examples/cpp/llama/llama_config.ini create mode 100644 examples/cpp/llama/llama_example.cc create mode 100644 examples/cpp/llama/llama_triton_example.cc create mode 100644 examples/cpp/llama/model_config.json create mode 100644 examples/cpp/llama/start_ids.csv create mode 100644 examples/cpp/llama/stop_words.csv create mode 100644 src/fastertransformer/models/llama/CMakeLists.txt create mode 100644 src/fastertransformer/models/llama/Llama.cc create mode 100644 src/fastertransformer/models/llama/Llama.h create mode 100644 src/fastertransformer/models/llama/LlamaContextDecoder.cc create mode 100644 src/fastertransformer/models/llama/LlamaContextDecoder.h create mode 100644 src/fastertransformer/models/llama/LlamaDecoder.cc create mode 100644 src/fastertransformer/models/llama/LlamaDecoder.h create mode 100644 src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc create mode 100644 src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h create mode 100644 src/fastertransformer/models/llama/LlamaWeight.cc create mode 100644 src/fastertransformer/models/llama/LlamaWeight.h diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index da24d72c6..64da9d2e7 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -28,6 +28,8 @@ add_subdirectory(gptj) add_subdirectory(gptneox) add_subdirectory(multi_gpu_gpt) +add_subdirectory(llama) + if(ENABLE_FP8) add_subdirectory(gpt_fp8) add_subdirectory(bert_fp8) diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt new file mode 100644 index 000000000..0495d3bf2 --- /dev/null +++ b/examples/cpp/llama/CMakeLists.txt @@ -0,0 +1,22 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_executable(llama_example llama_example.cc) +target_link_libraries(llama_example PUBLIC -lcublas -lcublasLt -lcudart + Llama nvtx_utils gpt_example_utils word_list mpi_utils nccl_utils) + +# add_executable(llama_triton_example llama_triton_example.cc) +# target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart -lpthread +# LlamaTritonBackend TransformerTritonBackend custom_ar_comm +# gpt_example_utils word_list mpi_utils nccl_utils nvtx_utils) diff --git a/examples/cpp/llama/bad_words.csv b/examples/cpp/llama/bad_words.csv new file mode 100644 index 000000000..6a1126ebd --- /dev/null +++ b/examples/cpp/llama/bad_words.csv @@ -0,0 +1,2 @@ +7768,3908 +1,2 diff --git a/examples/cpp/llama/check_with_huggingface.py b/examples/cpp/llama/check_with_huggingface.py new file mode 100644 index 000000000..0ba69036e --- /dev/null +++ b/examples/cpp/llama/check_with_huggingface.py @@ -0,0 +1,15 @@ +import transformers +import torch + +from transformers import LlamaForCausalLM, LlamaTokenizer + +tokenizer = LlamaTokenizer.from_pretrained('/data/llama-7b-hf') +prompt = "Hey" +inputs = tokenizer(prompt, return_tensors='pt') +print(inputs) + +model = LlamaForCausalLM.from_pretrained("/data/llama-7b-hf") +generated_ids = model.generate(inputs.input_ids, max_length=10) +print(generated_ids) +output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] +print(output) diff --git a/examples/cpp/llama/huggingface_llama_convert.py b/examples/cpp/llama/huggingface_llama_convert.py new file mode 100644 index 000000000..28a552a28 --- /dev/null +++ b/examples/cpp/llama/huggingface_llama_convert.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
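+# Example invocation (paths and the model name are illustrative; they match the
+# defaults used in examples/cpp/llama/llama_config.ini):
+#   python huggingface_llama_convert.py -i /data/llama-7b-hf -o /data/llama-7b-hf-converted \
+#       -i_g 1 -weight_data_type fp16 -m_n llama_7b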
+ +import argparse +import configparser +import numpy as np +from pathlib import Path + +import os +from transformers import LlamaForCausalLM + +def get_weight_data_type(data_type): + if data_type == "fp32": + return np.float32 + elif data_type == "fp16": + return np.float16 + else: + assert False, f"Invalid weight data type {data_type}" + + +def split_and_convert_process(saved_dir, factor, key, val): + if key.find("input_layernorm.weight") != -1 or key.find("post_attention_layernorm.weight") != -1: + # shared weights, only need to convert the weights of rank 0 + saved_path = saved_dir + "/" + key + ".bin" + val.tofile(saved_path) + elif key.find("attention.dense.weight") != -1 or key.find("mlp.down_proj.weight") != -1: + split_vals = np.split(val, factor, axis=0) + for j in range(factor): + saved_path = saved_dir + "/" + key + ".%d.bin" % j + split_vals[j].tofile(saved_path) + elif key.find("mlp.gate_proj.weight") != -1 or key.find("mlp.up_proj.weight") != -1: + split_vals = np.split(val, factor, axis=-1) + for j in range(factor): + saved_path = saved_dir + "/" + key + ".%d.bin" % j + split_vals[j].tofile(saved_path) + elif key.find("attention.query_key_value.weight") != -1: + split_vals = np.split(val, factor, axis=-1) + for j in range(factor): + saved_path = saved_dir + "/" + key + ".%d.bin" % j + split_vals[j].tofile(saved_path) + else: + print("[ERROR] cannot find key '{}'".format(key)) + +def split_and_convert(args): + saved_dir = args.saved_dir + "/%d-gpu/" % args.infer_gpu_num + + if(os.path.exists(saved_dir) == False): + os.makedirs(saved_dir) + + t_gpu_num = args.trained_gpu_num + i_gpu_num = args.infer_gpu_num + assert(i_gpu_num % t_gpu_num == 0) + + factor = (int)(i_gpu_num / t_gpu_num) + + # load position_embedding from rank 0 + # model = torch.load(ckpt_name) + model = LlamaForCausalLM.from_pretrained(args.in_file) + hf_config = vars(model.config) + print(f"hf_config: {hf_config}") + + print("named parameters:") + for name, param in model.named_parameters(): + print(f"- {name}") + + hidden_size = hf_config["hidden_size"] + head_num = hf_config["num_attention_heads"] + head_size = hidden_size // head_num + num_layers = hf_config["num_hidden_layers"] + + + np_weight_data_type = get_weight_data_type(args.weight_data_type) + + try: + model_name = args.model_name + config = configparser.ConfigParser() + config['llama'] = {} + config['llama']['model_name'] = model_name + config['llama']["head_num"] = str(head_num) + config['llama']["size_per_head"] = str(head_size) + config['llama']["inter_size"] = str(hf_config["intermediate_size"]) + config['llama']["num_layer"] = str(num_layers) + config['llama']["rotary_embedding"] = str(head_size) + config['llama']["vocab_size"] = str(hf_config["vocab_size"]) + config['llama']["start_id"] = str(hf_config["bos_token_id"]) + config['llama']["end_id"] = str(hf_config["eos_token_id"]) + config['llama']["weight_data_type"] = args.weight_data_type + + with open((Path(saved_dir) / f"config.ini").as_posix(), 'w') as configfile: + config.write(configfile) + except Exception as e: + print(f"Fail to save the config in config.ini.") + print(e) + + param_to_weights = lambda param: param.detach().cpu().numpy().astype(np_weight_data_type) + + # layer-wise weights, example: + # - model.layers.0.self_attn.q_proj.weight + # - model.layers.0.self_attn.k_proj.weight + # - model.layers.0.self_attn.v_proj.weight + # - model.layers.0.self_attn.o_proj.weight + # - model.layers.0.mlp.gate_proj.weight + # - model.layers.0.mlp.down_proj.weight + # - 
model.layers.0.mlp.up_proj.weight + # - model.layers.0.input_layernorm.weight + # - model.layers.0.post_attention_layernorm.weight + for l in range(num_layers): + print(f"converting layer {l}") + # first merge QKV into a single weight + # concat direct to FT shape: [hidden_size, 3, head_num, head_size] + qkv_weights = np.empty((hidden_size, 3, head_num, head_size), dtype=np_weight_data_type) + q_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight']) + k_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.k_proj.weight']) + v_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.v_proj.weight']) + qkv_weights[:, 0, :, :] = q_weight.reshape(hidden_size, head_num, head_size) + qkv_weights[:, 1, :, :] = k_weight.reshape(hidden_size, head_num, head_size) + qkv_weights[:, 2, :, :] = v_weight.reshape(hidden_size, head_num, head_size) + qkv_weights_base_name = f'model.layers.{l}.attention.query_key_value.weight' + split_and_convert_process(saved_dir, factor, qkv_weights_base_name, qkv_weights) + + # attention dense + o_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.o_proj.weight']) + o_weight_base_name = f'model.layers.{l}.attention.dense.weight' + split_and_convert_process(saved_dir, factor, o_weight_base_name, o_weight) + + # MLP + mlp_down_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.down_proj.weight']) + mlp_down_base_name = f'model.layers.{l}.mlp.down_proj.weight' + split_and_convert_process(saved_dir, factor, mlp_down_base_name, mlp_down_weight) + + mlp_gate_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.gate_proj.weight']) + mlp_gate_base_name = f'model.layers.{l}.mlp.gate_proj.weight' + split_and_convert_process(saved_dir, factor, mlp_gate_base_name, mlp_gate_weight) + + mlp_up_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.up_proj.weight']) + mlp_up_base_name = f'model.layers.{l}.mlp.up_proj.weight' + split_and_convert_process(saved_dir, factor, mlp_up_base_name, mlp_up_weight) + + # LayerNorm + input_ln_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.input_layernorm.weight']) + input_ln_base_name = f'model.layers.{l}.input_layernorm.weight' + split_and_convert_process(saved_dir, factor, input_ln_base_name, input_ln_weight) + + post_attn_ln_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.post_attention_layernorm.weight']) + post_attn_ln_base_name = f'model.layers.{l}.post_attention_layernorm.weight' + split_and_convert_process(saved_dir, factor, post_attn_ln_base_name, post_attn_ln_weight) + + print(f"done layer {l}") + + + # final common weights + for name, param in model.named_parameters(): + if name == 'model.embed_tokens.weight': + param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.wte.weight.bin") + elif name == 'model.norm.weight': + param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.final_layernorm.weight.bin") + elif name == 'lm_head.weight': + param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.lm_head.weight.bin") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('-saved_dir', '-o', type=str, help='file name of output file', required=True) + parser.add_argument('-in_file', '-i', type=str, help='file name of input checkpoint file', required=True) + parser.add_argument('-trained_gpu_num', '-t_g', 
type=int, help='How many gpus for inference', default=1) + parser.add_argument('-infer_gpu_num', '-i_g', type=int, help='How many gpus for inference', required=True) + parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"]) + parser.add_argument('-model_name', '-m_n', type=str, help='model name', required=True) + + args = parser.parse_args() + print("\n=============== Argument ===============") + for key in vars(args): + print("{}: {}".format(key, vars(args)[key])) + print("========================================") + + split_and_convert(args) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini new file mode 100644 index 000000000..5882e03a4 --- /dev/null +++ b/examples/cpp/llama/llama_config.ini @@ -0,0 +1,32 @@ +[ft_instance_hyperparameter] +data_type=fp16 +enable_custom_all_reduce=0 + +tensor_para_size=1 +pipeline_para_size=1 + +model_name=llama_7b +model_dir=/data/llama-7b-hf-converted + +[request] +beam_width=1 # beam width for beam search +top_k=1 ; k value for top k sampling +top_p=0.0 ; p value for top p sampling +temperature=1.0 ; Use for sampling +repetition_penalty=1.0 ; Use for sampling +presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed. +len_penalty=0.0 +beam_search_diversity_rate=0.0 +request_batch_size=8 # determine by the request +request_output_len=32 # determine by the request + +[llama_7b] +head_num = 32 +size_per_head = 128 +inter_size = 11008 +num_layer = 32 +rotary_embedding = 128 +vocab_size = 32000 +start_id = 0 +end_id = 1 +weight_data_type = fp16 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc new file mode 100644 index 000000000..c72a7a8b7 --- /dev/null +++ b/examples/cpp/llama/llama_example.cc @@ -0,0 +1,503 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "3rdparty/INIReader.h" +#include "examples/cpp/multi_gpu_gpt/gpt_example_utils.h" +#include "src/fastertransformer/models/llama/Llama.h" +#include "src/fastertransformer/utils/mpi_utils.h" +#include "src/fastertransformer/utils/nccl_utils.h" +#include "src/fastertransformer/utils/nvtx_utils.h" +#include "src/fastertransformer/utils/word_list.h" + +#include +#include +#include +#include +#include +#include + +using namespace fastertransformer; + +template +void llama_example(const INIReader reader); + +int main(int argc, char* argv[]) +{ + mpi::initialize(&argc, &argv); + srand(0); + + std::string ini_name; + if (argc == 2) { + ini_name = std::string(argv[1]); + } + else { + ini_name = "../examples/cpp/llama/llama_config.ini"; + } + + INIReader reader = INIReader(ini_name); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << ini_name << "'\n"; + return -1; + } + const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type"); + + if (data_type == "fp32") { + llama_example(reader); + } + else if (data_type == "fp16") { + llama_example(reader); + } + else { + FT_LOG_ERROR("is_fp16 should be 0 (use float) or 1 (use half)."); + return -1; + } + mpi::finalize(); + return 0; +} + +template +void llama_example(const INIReader reader) +{ + const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); + std::string model_dir = std::string(reader.Get("ft_instance_hyperparameter", "model_dir")); + + int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); + int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + + const size_t head_num = reader.GetInteger(model_name, "head_num"); + const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); + const size_t vocab_size = reader.GetInteger(model_name, "vocab_size"); + const size_t decoder_layers = reader.GetInteger(model_name, "num_layer"); + const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); + const int start_id = reader.GetInteger(model_name, "start_id"); + const int end_id = reader.GetInteger(model_name, "end_id"); + + const size_t hidden_units = head_num * size_per_head; + const size_t inter_size = reader.GetInteger(model_name, "inter_size"); + + const size_t beam_width = reader.GetInteger("request", "beam_width"); + const uint top_k = (uint)reader.GetInteger("request", "top_k"); + const float top_p = reader.GetFloat("request", "top_p"); + const float temperature = reader.GetFloat("request", "temperature"); + const float repetition_penalty = reader.GetFloat("request", "repetition_penalty", 1.0f); + const float presence_penalty = reader.GetFloat("request", "presence_penalty", 0.0f); + const float len_penalty = reader.GetFloat("request", "len_penalty"); + const float beam_search_diversity_rate = reader.GetFloat("request", "beam_search_diversity_rate"); + const int min_length = reader.GetInteger("request", "min_length", 0); + const size_t request_batch_size = 1; // reader.GetInteger("request", "request_batch_size"); + // The length of tokens we hope this model to generate + const int request_output_len = reader.GetInteger("request", "request_output_len"); + + FT_CHECK(head_num % tensor_para_size == 0); + FT_CHECK(decoder_layers % pipeline_para_size == 0); + FT_CHECK_WITH_INFO( + repetition_penalty == 1.0f || presence_penalty == 0.0f, + fmtstr("Found ambiguous parameters repetition_penalty (%f) and presence_penalty (%f) " + "which are mutually 
exclusive. Please remove one of repetition_penalty or presence_penalty " + "or set to a default value.", + repetition_penalty, + presence_penalty)); + + // Prepare the parallelism parameters + int rank = mpi::getCommWorldRank(); + int world_size = mpi::getCommWorldSize(); + if (rank == 0) { + printf("Total ranks: %d.\n", world_size); + } + int device, device_count; + check_cuda_error(cudaGetDeviceCount(&device_count)); + check_cuda_error(cudaSetDevice(rank % device_count)); + check_cuda_error(cudaGetDevice(&device)); + + struct cudaDeviceProp prop; + check_cuda_error(cudaGetDeviceProperties(&prop, device)); + printf("Device %s\n", prop.name); + + printf("P%d is running with GPU #%d.\n", rank, device); + if (tensor_para_size * pipeline_para_size != world_size) { + if (world_size % pipeline_para_size) { + printf("[ERROR] tensor_para_size * pipeline_para_size should equal to world_size \n"); + exit(-1); + } + tensor_para_size = world_size / pipeline_para_size; + printf("[INFO] Setting tensor_para_size to %d \n", tensor_para_size); + } + + const int layers_per_group = decoder_layers / pipeline_para_size; + if (layers_per_group * pipeline_para_size != (int)decoder_layers) { + printf("[ERROR] layers_per_group (%d) * pipeline_para_size (%d) should equal to decoder_layers (%ld) \n", + layers_per_group, + pipeline_para_size, + decoder_layers); + exit(-1); + } + + // assume gpu_num = k * n, + // tensor parallelism group size is n + // pipeline parallelism group size is k + NcclParam tensor_para; + NcclParam pipeline_para; + ftNcclInitialize(tensor_para, pipeline_para, tensor_para_size, pipeline_para_size); + + // Handle bad_words dictionary + std::vector bad_words; + read_word_list("../examples/cpp/llama/bad_words.csv", bad_words); + + int* d_bad_words = nullptr; + deviceMalloc(&d_bad_words, bad_words.size(), false); + cudaH2Dcpy(d_bad_words, bad_words.data(), bad_words.size()); + + // Handle stop_words dictionary + std::vector stop_words; + read_word_list("../examples/cpp/llama/stop_words.csv", stop_words); + + const size_t stop_words_len = stop_words.size() / 2; + // Tile with same dict for each element + std::vector tiled_stop_words; + for (int i = 0; i < request_batch_size; i++) { + tiled_stop_words.insert(tiled_stop_words.end(), stop_words.begin(), stop_words.end()); + } + + int* d_stop_words = nullptr; + deviceMalloc(&d_stop_words, tiled_stop_words.size(), false); + cudaH2Dcpy(d_stop_words, tiled_stop_words.data(), tiled_stop_words.size()); + + // Read ids of request from file. + size_t max_input_len = -1; + std::vector v_start_lengths; + std::vector v_start_ids; + read_start_ids(request_batch_size, + &v_start_lengths, + &v_start_ids, + max_input_len, + end_id, + 1, + "../examples/cpp/llama/start_ids.csv"); + + int* d_input_ids; + int* d_input_lengths; + if (max_input_len == 0) { + // unconditional case, no input ids, so do nothing. + d_input_ids = nullptr; + d_input_lengths = nullptr; + } + else { + // conditional case. 
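+ // Copy the prompt token ids ([request_batch_size, max_input_len]) and the
+ // per-sequence input lengths from host to device.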
+ deviceMalloc(&d_input_ids, request_batch_size * max_input_len, false); + deviceMalloc(&d_input_lengths, request_batch_size, false); + cudaH2Dcpy(d_input_ids, v_start_ids.data(), request_batch_size * max_input_len); + cudaH2Dcpy(d_input_lengths, v_start_lengths.data(), request_batch_size); + } + std::vector start_ids(request_batch_size, start_id); + std::vector end_ids(request_batch_size, end_id); + + // Prompt Learning Configurations + // NOTE: if you don't need prefix prompts, remember to set max_prefix_len to 0 and others to nullptr + int prompt_learning_start_id = reader.GetInteger(model_name, "prompt_learning_start_id", end_id + 1); + fastertransformer::PromptLearningType prompt_learning_type = + static_cast(reader.GetInteger(model_name, "prompt_learning_type", 0)); + + // NOTE: specify task names, take name id, prompt length in order to load those prompt learning tables. + // NOTE: Please make sure task ids are continuous and start from 0 + // for example: + // std::map> prefix_prompt_table_pair{{"no_prompt", {0, 0}}, + // {"prompt_1", {1, 1}}, + // {"prompt_2", {2, 2}}, + // {"prompt_3", {3, 3}}, + // {"prompt_4", {4, 4}}, + // {"prompt_5", {5, 5}}}; + + std::map> prefix_prompt_table_pair; + + // NOTE: get prompt table pairs from configuration files + const int num_tasks = reader.GetInteger(model_name, "num_tasks", 0); + for (int task_name_id = 0; task_name_id < num_tasks; task_name_id++) { + std::string config_task_name = model_name + "_task_" + std::to_string(task_name_id); + std::string task_name = reader.Get(config_task_name, "task_name"); + const int prompt_length = reader.GetInteger(config_task_name, "prompt_length", 0); + prefix_prompt_table_pair.insert({task_name, {task_name_id, prompt_length}}); + } + + // NOTE: task_name_ids for each sequence in one batch + // Each sequence can have different prompt learning task ids + std::vector prefix_prompt_task_ids(request_batch_size, 0); + + // Set different task ids + for (int i = 0; i < request_batch_size; i++) { + prefix_prompt_task_ids[i] = (num_tasks > 0) ? 
i % num_tasks : 0; + } + + const int total_output_len = max_input_len + request_output_len; + + cudaStream_t stream; + cublasHandle_t cublas_handle; + cublasLtHandle_t cublaslt_handle; + cudaStreamCreate(&stream); + cublasCreate(&cublas_handle); + cublasLtCreate(&cublaslt_handle); + cublasSetStream(cublas_handle, stream); + cublasAlgoMap* cublas_algo_map = new cublasAlgoMap("gemm_config.in"); + + Allocator allocator(getDevice()); + + std::mutex* cublas_wrapper_mutex = new std::mutex(); + cublasMMWrapper cublas_wrapper = + cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, cublas_wrapper_mutex, &allocator); + if (std::is_same::value) { + cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper.setFP32GemmConfig(); + } + + const bool use_gptj_residual = (bool)reader.GetInteger(model_name, "use_gptj_residual", 1); + fastertransformer::LlamaWeight gpt_weights(hidden_units, + inter_size, + vocab_size, + decoder_layers, + 0, // max_seq_len, deprecated + tensor_para.world_size_, + tensor_para.rank_, + pipeline_para.world_size_, + pipeline_para.rank_, + use_gptj_residual, + prompt_learning_type, + prefix_prompt_table_pair); + + model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; + gpt_weights.loadModel(model_dir); + unsigned long long random_seed; + if (rank == 0) { + random_seed = (unsigned long long)(0); + } + if (world_size > 1) { + mpi::bcast(&random_seed, 1, mpi::MPI_TYPE_UNSIGNED_LONG_LONG, 0, mpi::COMM_WORLD); + } + + AttentionType attention_type = getAttentionType(size_per_head, + getSMVersion(), + true, // remove_padding + 0, // gpt supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + + Llama gpt = Llama(head_num, + size_per_head, + inter_size, + decoder_layers, + vocab_size, + rotary_embedding_dim, + start_id, + end_id, + prompt_learning_start_id, + prompt_learning_type, + use_gptj_residual, + 0.0f, + top_k, + top_p, + random_seed, + temperature, + len_penalty, + repetition_penalty, + tensor_para, + pipeline_para, + stream, + &cublas_wrapper, + &allocator, + false, + &prop, + attention_type); + + int* d_output_ids; + int* d_sequence_lengths; + deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); + deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); + std::vector output_seq_len(request_batch_size, total_output_len); + std::unordered_map input_tensors = std::unordered_map{ + {"input_ids", + Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, (size_t)max_input_len}, d_input_ids}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size}, d_input_lengths}}, + // NOTE: if you need prefix prompts, remember to add prefix_prompt_task_ids here + // {"prompt_learning_task_name_ids", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, + // prefix_prompt_task_ids.data()}}, + {"output_seq_len", + Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}, + {"bad_words_list", Tensor{MEMORY_GPU, TYPE_INT32, {2, bad_words.size() / 2}, d_bad_words}}, + {"stop_words_list", Tensor{MEMORY_GPU, TYPE_INT32, {request_batch_size, 2, stop_words_len}, d_stop_words}}, + {"temperature", Tensor{MEMORY_CPU, TYPE_FP32, std::vector{1}, &temperature}}, + {"len_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector{1}, &len_penalty}}, + {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{1}, 
&min_length}}, + {"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, start_ids.data()}}, + {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, end_ids.data()}}}; + + if (repetition_penalty != 1.0f) { + input_tensors.insert( + {"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector{1}, &repetition_penalty}}); + } + if (presence_penalty != 0.0f) { + input_tensors.insert( + {"presence_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector{1}, &presence_penalty}}); + } + + if (num_tasks > 0) { + // Prefix Prompt Task Name Ids here + input_tensors.insert( + {"prompt_learning_task_name_ids", + Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, prefix_prompt_task_ids.data()}}); + } + + if (top_k == 0 && top_p == 0.0f) { + FT_CHECK(beam_width > 1); + input_tensors.insert({"beam_search_diversity_rate", + Tensor{MEMORY_CPU, TYPE_FP32, std::vector{1}, &beam_search_diversity_rate}}); + } + else { + input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}); + if (top_p != 0.0f) { + input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector{1}, &top_p}}); + } + if (top_k != 0) { + input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &top_k}}); + } + } + + std::unordered_map output_tensors = std::unordered_map{ + {"output_ids", + Tensor{MEMORY_GPU, + TYPE_INT32, + std::vector{request_batch_size, beam_width, (size_t)total_output_len}, + d_output_ids}}, + {"sequence_length", + Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths}}, + {"output_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + std::vector{(size_t)request_output_len, request_batch_size, beam_width}, + nullptr}}}; + + print_mem_usage(); + + int ite = 1; + cudaDeviceSynchronize(); + mpi::barrier(); + + cudaProfilerStart(); + // warm up + ite = 1; + ft_nvtx::setScope("warmup_time"); + PUSH_RANGE("warmup time") + for (int i = 0; i < ite; ++i) { + gpt.forward(&output_tensors, &input_tensors, &gpt_weights); + } + cudaDeviceSynchronize(); + mpi::barrier(); + + POP_RANGE; + ft_nvtx::resetScope(); + + if (rank == 0) { + + std::string fName = "out"; + auto outFile = std::ofstream(fName, std::ios::out); + if (!outFile.is_open()) { + printf("[WARNING] Cannot write results into output file %s \n", fName.c_str()); + } + else { + size_t outCount = total_output_len * request_batch_size * beam_width; + int* hBuf = new int[outCount]; + cudaD2Hcpy(hBuf, d_output_ids, outCount); + + { + std::cout << "Writing " << outCount << " elements\n"; + int zeroCount = 0; + for (size_t i = 0; i < outCount; i++) { + if (hBuf[i] == int(0)) { + zeroCount++; + } + outFile << hBuf[i] << " "; + if ((i + 1) % (total_output_len) == 0) { + outFile << std::endl; + } + + if (i < 10) { + printf("%5d ", hBuf[i]); + } + if ((i + 1) % (total_output_len) == 0 && i < 10) { + std::cout << std::endl; + } + } + std::cout << std::endl << "zeroCount = " << zeroCount << std::endl; + } + delete[] hBuf; + } + } + + // test time + struct timeval start, end; + mpi::barrier(); + cudaDeviceSynchronize(); + gettimeofday(&start, NULL); + + ft_nvtx::setScope("total_time"); + PUSH_RANGE("total time") + for (int i = 0; i < ite; ++i) { + gpt.forward(&output_tensors, &input_tensors, &gpt_weights); + } + + cudaDeviceSynchronize(); + mpi::barrier(); + + POP_RANGE; + ft_nvtx::resetScope(); + gettimeofday(&end, NULL); + + cudaProfilerStop(); + + printf("[INFO] request_batch_size %ld beam_width %ld head_num %ld 
size_per_head %ld total_output_len %d" + " decoder_layers %ld vocab_size %ld FT-CPP-decoding-beamsearch-time %.2f ms\n", + request_batch_size, + beam_width, + head_num, + size_per_head, + total_output_len, + decoder_layers, + vocab_size, + ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite); + + ftNcclParamDestroy(tensor_para); + ftNcclParamDestroy(pipeline_para); + + delete cublas_algo_map; + delete cublas_wrapper_mutex; + + cudaFree(d_bad_words); + cudaFree(d_stop_words); + if (d_input_ids != nullptr) { + cudaFree(d_input_ids); + } + if (d_input_lengths != nullptr) { + cudaFree(d_input_lengths); + } + if (d_output_ids != nullptr) { + deviceFree(d_output_ids); + } + if (d_sequence_lengths != nullptr) { + deviceFree(d_sequence_lengths); + } + + return; +} diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc new file mode 100644 index 000000000..1840035a2 --- /dev/null +++ b/examples/cpp/llama/llama_triton_example.cc @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "3rdparty/INIReader.h" +#include "examples/cpp/multi_gpu_gpt/gpt_example_utils.h" +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h" +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/mpi_utils.h" +#include "src/fastertransformer/utils/nccl_utils.h" +#include "src/fastertransformer/utils/nvtx_utils.h" +#include "src/fastertransformer/utils/word_list.h" + +#include +#include + +namespace ft = fastertransformer; + +struct RequestParam { + int beam_width; + int request_output_len; + float beam_search_diversity_rate; + uint runtime_top_k; + float runtime_top_p; + float temperature; + float len_penalty; + float repetition_penalty; + float presence_penalty; + int min_length; + unsigned long long int random_seed; + int start_id; + int end_id; +}; + +std::vector>> +broadCastRequest(const std::vector& v_start_ids, + const std::vector& v_start_lengths, + const std::vector& v_bad_words, + const int node_id, + const int gpu_count, + const RequestParam param, + std::vector* pointer_record) +{ + // broadcast the request to all nodes, and copy "gpu_count" copies on different gpu + int size_1 = v_start_ids.size(); + int size_2 = v_start_lengths.size(); + int size_bad_words = v_bad_words.size(); + ft::mpi::bcast(&size_1, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + ft::mpi::bcast(&size_2, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + ft::mpi::bcast(&size_bad_words, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + + std::vector v_input_ids(size_1); + std::vector v_input_lengths(size_2); + std::vector v_input_bad_words(size_bad_words); + + if (node_id == 0) { + memcpy(v_input_ids.data(), v_start_ids.data(), size_1 * sizeof(int)); + memcpy(v_input_lengths.data(), v_start_lengths.data(), 
size_2 * sizeof(int)); + memcpy(v_input_bad_words.data(), v_bad_words.data(), size_bad_words * sizeof(int)); + } + ft::mpi::barrier(); + + int request_batch_size = size_2; + int max_input_len = size_1 / size_2; + + ft::mpi::bcast(v_input_ids.data(), size_1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + ft::mpi::bcast(v_input_lengths.data(), size_2, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + ft::mpi::bcast(v_input_bad_words.data(), size_bad_words, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + + std::vector>> request_list; + for (int device_id = 0; device_id < gpu_count; device_id++) { + ft::check_cuda_error(cudaSetDevice(device_id)); + + int* d_input_ids; + int* d_input_lengths; + int* d_input_bad_words; + + if (max_input_len == 0) { + // unconditional case, no input ids, so do nothing. + d_input_ids = nullptr; + d_input_lengths = nullptr; + max_input_len = 0; + } + else { + // conditional case. + ft::deviceMalloc(&d_input_ids, size_1, false); + ft::deviceMalloc(&d_input_lengths, size_2, false); + ft::cudaH2Dcpy(d_input_ids, v_input_ids.data(), size_1); + ft::cudaH2Dcpy(d_input_lengths, v_input_lengths.data(), size_2); + } + ft::deviceMalloc(&d_input_bad_words, size_bad_words, false); + ft::cudaH2Dcpy(d_input_bad_words, v_input_bad_words.data(), size_bad_words); + + uint32_t* request_output_len_ptr = (uint32_t*)malloc(request_batch_size * sizeof(uint32_t)); + for (int i = 0; i < request_batch_size; i++) { + request_output_len_ptr[i] = param.request_output_len; + } + + int* start_ids_ptr = (int*)malloc(request_batch_size * sizeof(int)); + int* end_ids_ptr = (int*)malloc(request_batch_size * sizeof(int)); + for (int i = 0; i < request_batch_size; i++) { + start_ids_ptr[i] = param.start_id; + end_ids_ptr[i] = param.end_id; + } + pointer_record->push_back(start_ids_ptr); + pointer_record->push_back(end_ids_ptr); + + std::shared_ptr> input_tensors( + new std::unordered_map{ + {"input_ids", + triton::Tensor{triton::MEMORY_GPU, + triton::TYPE_INT32, + std::vector{(size_t)request_batch_size, (size_t)max_input_len}, + d_input_ids}}, + {"input_lengths", + triton::Tensor{triton::MEMORY_GPU, + triton::TYPE_INT32, + std::vector{(size_t)request_batch_size}, + d_input_lengths}}, + // NOTE: add prefix prompt task ids here if you need + // {"prefix_prompt_task_ids", triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, + // std::vector{request_batch_size}, task_name_ids}}, + {"request_output_len", + triton::Tensor{triton::MEMORY_CPU, + triton::TYPE_UINT32, + std::vector{(size_t)request_batch_size}, + request_output_len_ptr}}, + {"start_id", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, start_ids_ptr}}, + {"end_id", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}}}); + if (!v_input_bad_words.empty()) { + input_tensors->insert( + {"bad_words_list", + triton::Tensor{ + triton::MEMORY_GPU, triton::TYPE_INT32, {2, v_input_bad_words.size() / 2}, d_input_bad_words}}); + } + request_list.push_back(input_tensors); + + int* beam_width_ptr = new int(param.beam_width); + pointer_record->push_back(beam_width_ptr); + request_list[device_id]->insert( + {"beam_width", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector{1}, beam_width_ptr}}); + if (param.beam_width > 1) { + float* beam_search_diversity_rate_ptr = new float(param.beam_search_diversity_rate); + pointer_record->push_back(beam_search_diversity_rate_ptr); + request_list[device_id]->insert( + {"beam_search_diversity_rate", + 
triton::Tensor{ + triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, beam_search_diversity_rate_ptr}}); + } + else { + if (param.runtime_top_p != 0.0f) { + float* runtime_top_p_ptr = new float(param.runtime_top_p); + pointer_record->push_back(runtime_top_p_ptr); + request_list[device_id]->insert( + {"runtime_top_p", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, runtime_top_p_ptr}}); + } + if (param.runtime_top_k != 0) { + uint* runtime_top_k_ptr = new uint(param.runtime_top_k); + pointer_record->push_back(runtime_top_k_ptr); + request_list[device_id]->insert( + {"runtime_top_k", + triton::Tensor{ + triton::MEMORY_CPU, triton::TYPE_UINT32, std::vector{1}, runtime_top_k_ptr}}); + } + } + float* temperature_ptr = new float(param.temperature); + pointer_record->push_back(temperature_ptr); + request_list[device_id]->insert( + {"temperature", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, temperature_ptr}}); + float* len_penalty_ptr = new float(param.len_penalty); + pointer_record->push_back(len_penalty_ptr); + request_list[device_id]->insert( + {"len_penalty", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, len_penalty_ptr}}); + if (param.repetition_penalty != 1.0f) { + float* repetition_penalty_ptr = new float(param.repetition_penalty); + pointer_record->push_back(repetition_penalty_ptr); + request_list[device_id]->insert( + {"repetition_penalty", + triton::Tensor{ + triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, repetition_penalty_ptr}}); + } + if (param.presence_penalty != 0.0f) { + float* presence_penalty_ptr = new float(param.presence_penalty); + pointer_record->push_back(presence_penalty_ptr); + request_list[device_id]->insert( + {"presence_penalty", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, presence_penalty_ptr}}); + } + int* min_length_ptr = new int(param.min_length); + pointer_record->push_back(min_length_ptr); + request_list[device_id]->insert( + {"min_length", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector{1}, min_length_ptr}}); + unsigned long long int* random_seed_ptr = new unsigned long long int(param.random_seed); + pointer_record->push_back(random_seed_ptr); + request_list[device_id]->insert( + {"random_seed", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_UINT64, std::vector{1}, random_seed_ptr}}); + + pointer_record->push_back(d_input_ids); + pointer_record->push_back(d_input_lengths); + pointer_record->push_back(d_input_bad_words); + pointer_record->push_back(request_output_len_ptr); + } + + return request_list; +} + +std::vector>> +prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector* pointer_record) +{ + INIReader reader = INIReader(ini_name); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << ini_name << "'\n"; + ft::FT_CHECK(false); + } + + const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); + const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); + const int start_id = reader.GetInteger(model_name, "start_id"); + const int end_id = reader.GetInteger(model_name, "end_id"); + + std::vector v_start_ids; + std::vector v_start_lengths; + + size_t max_input_len = 0; + ft::read_start_ids(request_batch_size, + &v_start_lengths, + &v_start_ids, + max_input_len, + end_id, + 1, + "../examples/cpp/gptj/start_ids.csv"); + + std::vector v_bad_words; + ft::read_word_list("../examples/cpp/gptj/bad_words.csv", v_bad_words); + 
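+ // Gather the per-request generation parameters from the config file; broadCastRequest()
+ // below replicates them (together with the input ids) across every node and GPU.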
+ RequestParam param; + param.beam_width = reader.GetInteger("ft_instance_hyperparameter", "beam_width"); + param.request_output_len = reader.GetInteger("request", "request_output_len"); + param.beam_search_diversity_rate = reader.GetFloat("ft_instance_hyperparameter", "beam_search_diversity_rate"); + param.runtime_top_k = (uint)reader.GetInteger("ft_instance_hyperparameter", "top_k"); + param.runtime_top_p = reader.GetFloat("ft_instance_hyperparameter", "top_p"); + param.temperature = reader.GetFloat("ft_instance_hyperparameter", "temperature"); + param.len_penalty = reader.GetFloat("ft_instance_hyperparameter", "len_penalty"); + param.repetition_penalty = reader.GetFloat("ft_instance_hyperparameter", "repetition_penalty", 1.0f); + param.presence_penalty = reader.GetFloat("ft_instance_hyperparameter", "presence_penalty", 0.0f); + param.min_length = reader.GetInteger("ft_instance_hyperparameter", "min_length", 0); + param.random_seed = (unsigned long long int)0; + param.start_id = start_id; + param.end_id = end_id; + + auto request_list = + broadCastRequest(v_start_ids, v_start_lengths, v_bad_words, node_id, gpu_count, param, pointer_record); + return request_list; +} + +int threadCreateModelInstances(std::shared_ptr model, + std::vector>* model_instances, + const int device_id, + const int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) +{ + printf("[INFO] rank = %d \n", rank); + ft::check_cuda_error(cudaSetDevice(device_id)); + cudaStream_t stream; + ft::check_cuda_error(cudaStreamCreate(&stream)); + model->createSharedWeights(device_id, rank); + auto model_instance = model->createModelInstance(device_id, rank, stream, nccl_params, custom_all_reduce_comm); + model_instances->at(device_id) = std::move(model_instance); + printf("model instance %d is created \n", device_id); + ft::print_mem_usage(); + return 0; +} + +int threadForward(std::unique_ptr* model_instance, + std::shared_ptr> request, + std::shared_ptr>* output_tensors, + const int device_id) +{ + ft::check_cuda_error(cudaSetDevice(device_id)); + *output_tensors = (*model_instance)->forward(request); + return 0; +} + +int main(int argc, char* argv[]) +{ + /* + Prepare the nccl ids, node id, device id and world size + by MPI or triton + */ + + ft::mpi::initialize(&argc, &argv); + int node_id = ft::mpi::getCommWorldRank(); + int node_num = ft::mpi::getCommWorldSize(); + + // Note: Only supports that all nodes have same gpu count + const int gpu_count = ft::getDeviceCount(); + const int world_size = node_num * gpu_count; + std::string ini_name = argc >= 2 ? 
std::string(argv[1]) : "../examples/cpp/gptj/gptj_config.ini"; + + // step 1: Create model + std::shared_ptr model = AbstractTransformerModel::createGptJModel(ini_name); + int tensor_para_size = model->getTensorParaSize(); + int pipeline_para_size = model->getPipelineParaSize(); + FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size), + "World Size != Tensor Parallel Size * Pipeline Parallel Size !"); + + std::cout << model->toString(); + + // step 2: Initialize the NCCL + std::pair, std::vector> nccl_params = model->createNcclParams(node_id); + cudaDeviceSynchronize(); + + // Optional Step: create custom all reduce comm + std::vector> custom_all_reduce_comms; + model->createCustomComms(&custom_all_reduce_comms, world_size); + + // step 3: Create model instances + std::vector> model_instances((size_t)gpu_count); + std::vector threads; + for (int device_id = 0; device_id < gpu_count; device_id++) { + const int rank = node_id * gpu_count + device_id; + threads.push_back(std::thread(threadCreateModelInstances, + model, + &model_instances, + device_id, + rank, + nccl_params, + custom_all_reduce_comms[rank])); + } + for (auto& t : threads) { + t.join(); + } + + // step 4: prepare request + std::vector pointer_record; // Used to prevent the pointers are release after leaving functions + std::vector>> request_list = + prepareRequest(ini_name, node_id, gpu_count, &pointer_record); + printf("[INFO] request is created \n"); + + // step 5: Forward + std::vector>> output_tensors_lists( + (size_t)gpu_count); + for (int i = 0; i < 2; i++) { + threads.clear(); + for (int device_id = 0; device_id < gpu_count; device_id++) { + threads.push_back(std::thread(threadForward, + &model_instances[device_id], + request_list[device_id], + &output_tensors_lists[device_id], + device_id)); + } + for (auto& t : threads) { + t.join(); + } + } + printf("[INFO] forward is completed. 
\n"); + + const int* d_output_ids = (const int*)output_tensors_lists[0].get()->at("output_ids").data; + const int batch_size = output_tensors_lists[0].get()->at("output_ids").shape[0]; + const int beam_width = output_tensors_lists[0].get()->at("output_ids").shape[1]; + const int seq_len = output_tensors_lists[0].get()->at("output_ids").shape[2]; + // step 6: check results + if (node_id == 0) { + + std::string fName = "out"; + auto outFile = std::ofstream(fName, std::ios::out); + if (!outFile.is_open()) { + printf("[WARNING] Cannot write results into output file %s \n", fName.c_str()); + } + else { + size_t outCount = batch_size * beam_width * seq_len; + int* hBuf = new int[outCount]; + ft::cudaD2Hcpy(hBuf, d_output_ids, outCount); + + { + std::cout << "Writing " << outCount << " elements\n"; + int zeroCount = 0; + for (size_t i = 0; i < outCount; i++) { + if (hBuf[i] == int(0)) { + zeroCount++; + } + outFile << hBuf[i] << " "; + if ((i + 1) % (seq_len) == 0) { + outFile << std::endl; + } + + if (i < 10) { + printf("%5d ", hBuf[i]); + } + if ((i + 1) % (seq_len) == 0 && i < 10) { + std::cout << std::endl; + } + } + std::cout << std::endl << "zeroCount = " << zeroCount << std::endl; + } + delete[] hBuf; + } + } + + // test time + struct timeval start, end; + ft::mpi::barrier(); + cudaDeviceSynchronize(); + gettimeofday(&start, NULL); + + const int ite = 1; + for (int i = 0; i < ite; i++) { + threads.clear(); + for (int device_id = 0; device_id < gpu_count; device_id++) { + threads.push_back(std::thread(threadForward, + &model_instances[device_id], + request_list[device_id], + &output_tensors_lists[device_id], + device_id)); + } + for (auto& t : threads) { + t.join(); + } + } + + cudaDeviceSynchronize(); + ft::mpi::barrier(); + + gettimeofday(&end, NULL); + + printf("[INFO] batch_size %d beam_width %d seq_len %d" + " FT-CPP-GPT-Triton-time %.2f ms\n", + batch_size, + beam_width, + seq_len, + ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite); + + ft::mpi::finalize(); + return 0; +} diff --git a/examples/cpp/llama/model_config.json b/examples/cpp/llama/model_config.json new file mode 100644 index 000000000..70266f26b --- /dev/null +++ b/examples/cpp/llama/model_config.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "max_position_embeddings": 2048, "hidden_size": 4096, "intermediate_size": 11008, "num_hidden_layers": 32, "num_attention_heads": 32, "hidden_act": "silu", "initializer_range": 0.02, "rms_norm_eps": 1e-06, "use_cache": True, "return_dict": True, "output_hidden_states": False, "output_attentions": False, "torchscript": False, "torch_dtype": torch.float16, "use_bfloat16": False, "tf_legacy_loss": False, "pruned_heads": {}, "tie_word_embeddings": False, "is_encoder_decoder": False, "is_decoder": False, "cross_attention_hidden_size": None, "add_cross_attention": False, "tie_encoder_decoder": False, "max_length": 20, "min_length": 0, "do_sample": False, "early_stopping": False, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": None, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": False, "return_dict_in_generate": False, "forced_bos_token_id": None, "forced_eos_token_id": None, "remove_invalid_values": False, "exponential_decay_length_penalty": None, "suppress_tokens": None, "begin_suppress_tokens": None, "architectures": 
["LLaMAForCausalLM"], "finetuning_task": None, "id2label": {0: "LABEL_0", 1: "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": None, "prefix": None, "bos_token_id": 0, "pad_token_id": -1, "eos_token_id": 1, "sep_token_id": None, "decoder_start_token_id": None, "task_specific_params": None, "problem_type": None, "_name_or_path": "/data/llama-7b-hf/", "_commit_hash": None, "transformers_version": "4.27.0.dev0", "max_sequence_length": 2048, "model_type": "llama"} diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv new file mode 100644 index 000000000..35ddd915d --- /dev/null +++ b/examples/cpp/llama/start_ids.csv @@ -0,0 +1,8 @@ +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 diff --git a/examples/cpp/llama/stop_words.csv b/examples/cpp/llama/stop_words.csv new file mode 100644 index 000000000..9b9b09eba --- /dev/null +++ b/examples/cpp/llama/stop_words.csv @@ -0,0 +1,2 @@ +287, 4346, 12 +3, -1, -1 diff --git a/src/fastertransformer/models/CMakeLists.txt b/src/fastertransformer/models/CMakeLists.txt index 248b4af3d..d55782717 100644 --- a/src/fastertransformer/models/CMakeLists.txt +++ b/src/fastertransformer/models/CMakeLists.txt @@ -37,3 +37,5 @@ add_subdirectory(vit) add_subdirectory(vit_int8) add_subdirectory(wenet) + +add_subdirectory(llama) diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt new file mode 100644 index 000000000..ec836068d --- /dev/null +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -0,0 +1,69 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cmake_minimum_required(VERSION 3.8) + +add_library(LlamaDecoderLayerWeight STATIC LlamaDecoderLayerWeight.cc) +set_property(TARGET LlamaDecoderLayerWeight PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LlamaDecoderLayerWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LlamaDecoderLayerWeight PUBLIC memory_utils cuda_utils logger) + +add_library(LlamaDecoder STATIC LlamaDecoder.cc) +set_property(TARGET LlamaDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LlamaDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LlamaDecoder PUBLIC -lcudart cublasMMWrapper + TensorParallelDecoderSelfAttentionLayer + TensorParallelGeluFfnLayer + layernorm_kernels + add_residual_kernels + LlamaDecoderLayerWeight + tensor + nccl_utils + cuda_utils + logger) + +add_library(LlamaContextDecoder STATIC LlamaContextDecoder.cc) +set_property(TARGET LlamaContextDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LlamaContextDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LlamaContextDecoder PUBLIC -lcudart cublasMMWrapper + TensorParallelGptContextAttentionLayer + TensorParallelGeluFfnLayer + layernorm_kernels + add_residual_kernels + gpt_kernels + tensor + nccl_utils + cuda_utils + logger) + +add_library(LlamaWeight STATIC LlamaWeight.cc) +set_property(TARGET LlamaWeight PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LlamaWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LlamaWeight PUBLIC LlamaDecoderLayerWeight cuda_utils logger) + +add_library(Llama STATIC Llama.cc) +set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(Llama PUBLIC -lcudart + LlamaDecoder + LlamaContextDecoder + decoding_kernels + gpt_kernels + DynamicDecodeLayer + BaseBeamSearchLayer + bert_preprocess_kernels + tensor + LlamaWeight + cuda_utils + logger) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc new file mode 100644 index 000000000..0091fce48 --- /dev/null +++ b/src/fastertransformer/models/llama/Llama.cc @@ -0,0 +1,1211 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/Llama.h" +#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" +#include "src/fastertransformer/kernels/decoding_kernels.h" +#include "src/fastertransformer/kernels/gpt_kernels.h" +#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" +#include + +namespace fastertransformer { + +template +void Llama::initialize() +{ + gpt_context_decoder_ = new LlamaContextDecoder(head_num_, + size_per_head_, + inter_size_, + num_layer_, + rotary_embedding_dim_, + neox_rotary_style_, + use_gptj_residual_, + layernorm_eps_, + tensor_para_, + pipeline_para_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + is_context_qk_buf_float_, + attention_type_, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + gpt_decoder_ = new LlamaDecoder(head_num_, + size_per_head_, + inter_size_, + num_layer_, + rotary_embedding_dim_, + neox_rotary_style_, + use_gptj_residual_, + layernorm_eps_, + tensor_para_, + pipeline_para_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + dynamic_decode_layer_ = new DynamicDecodeLayer(vocab_size_, + vocab_size_padded_, + 0, // end_id, deprecated + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + cuda_device_prop_); +} + +template +void Llama::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void Llama::allocateBuffer( + size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + const size_t batchxbeam = batch_size * beam_width; + const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batchxbeam * max_cache_seq_len + * hidden_units_ / tensor_para_.world_size_; + + if (vocab_size_ != vocab_size_padded_) { + padded_embedding_kernel_ = + (T*)(allocator_->reMalloc(padded_embedding_kernel_, sizeof(T) * hidden_units_ * vocab_size_padded_, true)); + padded_embedding_kernel_ptr_ = padded_embedding_kernel_; + + padded_embedding_bias_ = + (T*)(allocator_->reMalloc(padded_embedding_bias_, sizeof(T) * vocab_size_padded_, true)); + } + + input_attention_mask_ = (T*)(allocator_->reMalloc( + input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); + decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + decoder_output_buf_ = + (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + normed_decoder_output_buf_ = + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + nccl_logits_buf_ = + (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); + finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); + h_finished_buf_ = new bool[batchxbeam]; + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); + + key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); + value_cache_ = key_cache_ + self_cache_size; + if (beam_width > 1) { + cache_indirections_[0] = + 
(int*)(allocator_->reMalloc(cache_indirections_[0], sizeof(int) * batchxbeam * max_seq_len * 2, true)); + cache_indirections_[1] = cache_indirections_[0] + batchxbeam * max_seq_len; + } + + // prompt_learning weight batch ptrs + prompt_learning_weight_batch_ = + (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); + tiled_prompt_lengths_buf_ = + (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); + + tiled_input_ids_buf_ = + (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); + tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batchxbeam, true)); + tiled_total_padding_count_ = + (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), false); + + transposed_output_ids_buf_ = + (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); + output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); + parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); + seq_limit_len_ = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false)); + masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * max_cache_seq_len, true)); + + start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false)); + end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false)); + + context_decoder_input_buf_ = (T*)(allocator_->reMalloc( + context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); + context_decoder_output_buf_ = (T*)(allocator_->reMalloc( + context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); + output_log_probs_buf_ = + (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false)); + + generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + + is_allocate_buffer_ = true; +} + +template +void Llama::freeBuffer() +{ + if (is_allocate_buffer_) { + if (vocab_size_ != vocab_size_padded_) { + padded_embedding_kernel_ptr_ = nullptr; + allocator_->free((void**)(&padded_embedding_kernel_)); + allocator_->free((void**)(&padded_embedding_bias_)); + } + + allocator_->free((void**)(&input_attention_mask_)); + allocator_->free((void**)(&decoder_input_buf_)); + allocator_->free((void**)(&decoder_output_buf_)); + allocator_->free((void**)(&normed_decoder_output_buf_)); + allocator_->free((void**)(&logits_buf_)); + allocator_->free((void**)(&nccl_logits_buf_)); + allocator_->free((void**)(&cum_log_probs_)); + allocator_->free((void**)(&finished_buf_)); + delete[] h_finished_buf_; + allocator_->free((void**)(&sequence_lengths_)); + + allocator_->free((void**)(&key_cache_)); + if (cache_indirections_[0] != nullptr) { + allocator_->free((void**)(&cache_indirections_)[0]); + } + + allocator_->free((void**)(&prompt_learning_weight_batch_)); + allocator_->free((void**)(&tiled_prompt_lengths_buf_)); + + allocator_->free((void**)(&tiled_input_ids_buf_)); + allocator_->free((void**)(&tiled_input_lengths_buf_)); + allocator_->free((void**)(&tiled_total_padding_count_)); + + allocator_->free((void**)(&transposed_output_ids_buf_)); + allocator_->free((void**)(&output_ids_buf_)); + 
allocator_->free((void**)(&parent_ids_buf_)); + allocator_->free((void**)(&seq_limit_len_)); + allocator_->free((void**)(&masked_tokens_)); + + allocator_->free((void**)(&start_ids_buf_)); + allocator_->free((void**)(&end_ids_buf_)); + + allocator_->free((void**)(&context_decoder_input_buf_)); + allocator_->free((void**)(&context_decoder_output_buf_)); + allocator_->free((void**)(&output_log_probs_buf_)); + + allocator_->free((void**)(&generation_should_stop_), true); + + is_allocate_buffer_ = false; + } +} + +template +Llama::Llama(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + start_id_(start_id), + end_id_(end_id), + prompt_learning_start_id_(prompt_learning_start_id), + prompt_learning_type_(prompt_learning_type), + use_gptj_residual_(use_gptj_residual), + hidden_units_(head_num * size_per_head), + local_head_num_(head_num / 1), + attention_type_(attention_type) +{ + tensor_para_.world_size_ = 1; + tensor_para_.rank_ = 0; + pipeline_para_.world_size_ = 1; + pipeline_para_.rank_ = 0; + + int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); + if (std::is_same::value) { + local_vacab_size = ceil(local_vacab_size / 8.f) * 8; + } + vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; + initialize(); +} + +template +Llama::Llama(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + start_id_(start_id), + end_id_(end_id), + prompt_learning_start_id_(prompt_learning_start_id), + prompt_learning_type_(prompt_learning_type), + use_gptj_residual_(use_gptj_residual), + hidden_units_(head_num * size_per_head), + 
tensor_para_(tensor_para), + pipeline_para_(pipeline_para), + local_head_num_(head_num / tensor_para.world_size_), + custom_all_reduce_comm_(custom_all_reduce_comm), + enable_custom_all_reduce_(enable_custom_all_reduce), + attention_type_(attention_type) +{ + int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); + if (std::is_same::value) { + local_vacab_size = ceil(local_vacab_size / 8.f) * 8; + } + vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; + initialize(); +} + +template +Llama::Llama(Llama const& gpt): + BaseLayer(gpt), + head_num_(gpt.head_num_), + size_per_head_(gpt.size_per_head_), + inter_size_(gpt.inter_size_), + num_layer_(gpt.num_layer_), + vocab_size_(gpt.vocab_size_), + rotary_embedding_dim_(gpt.rotary_embedding_dim_), + start_id_(gpt.start_id_), + end_id_(gpt.end_id_), + prompt_learning_start_id_(gpt.prompt_learning_start_id_), + prompt_learning_type_(gpt.prompt_learning_type_), + use_gptj_residual_(gpt.use_gptj_residual_), + hidden_units_(gpt.hidden_units_), + tensor_para_(gpt.tensor_para_), + pipeline_para_(gpt.pipeline_para_), + local_head_num_(gpt.local_head_num_), + vocab_size_padded_(gpt.vocab_size_padded_), + custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), + enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), + attention_type_(gpt.attention_type_) +{ + initialize(); +} + +template +Llama::~Llama() +{ + delete gpt_decoder_; + delete dynamic_decode_layer_; + delete gpt_context_decoder_; + freeBuffer(); +} + +template +void Llama::registerCallback(callback_sig* fn, void* ctx) +{ + token_generated_cb_ = fn; + token_generated_ctx_ = ctx; +} + +template +void Llama::unRegisterCallback() +{ + token_generated_cb_ = nullptr; + token_generated_ctx_ = nullptr; +} + +template +void Llama::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const LlamaWeight* gpt_weights) +{ + FT_CHECK(false); +} + +template +void Llama::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const LlamaWeight* gpt_weights) +{ + // input_tensors: + // input_ids [batch_size, max_input_length] + // input_lengths [batch_size] + // prompt_learning_task_name_ids [batch_size] on cpu, optional + // output_seq_len [batch_size] on cpu + // start_id [batch_size] on cpu, optional + // end_id [batch_size] on cpu, optional + // stop_words_list [batch_size, 2, stop_words_length], optional + // bad_words_list [2, bad_words_length] or [batch_size, 2, bad_words_length], optional + // runtime_top_k [1] or [batch_size] on cpu, optional, uint. + // runtime_top_p [1] or [batch_size] on cpu, optional, float. + // beam_search_diversity_rate [1] or [batch_size] on cpu, optional, float. + // temperature [1] or [batch_size] on cpu, optional, float. + // len_penalty [1] or [batch_size] on cpu, optional, float. + // repetition_penalty [1] or [batch_size] on cpu, optional, float. + // min_length [1] or [batch_size] on cpu, optional, int + // random_seed [1] or [batch_size] on cpu, optional, unsigned long long int. 
+ // request_prompt_lengths [batch_size], optional + // request_prompt_embedding [batch_size, max_prompt_length, hidden_units], float, optional + // requst_prompt_type [batch_size], int, optional + // top_p_decay [batch_size] on gpu, float, optional + // top_p_min [batch_size] on gpu, float, optional + // top_p_reset_ids [batch_size] on gpu, uint32, optional + + // output_tensors: + // output_ids [batch_size, beam_width, max_output_seq_len] + // sequence_length [batch_size, beam_width] + // output_log_probs [batch_size, beam_width, request_output_seq_len], must be float*. + // optional. It leads to additional computing cost. If we don't need this result, don't put it. + // cum_log_probs [batch_size, beam], optional, must be float*. + // optional. It leads to additional computing cost. If we don't need this result, don't put it. + + // Step is from max_input_length ~ max_output_seq_len, + // When step = k, we put output ids and caches at step k, and the sequence_length would be k - 1 before + // complete this step. + // When there is no input_ids, put the start token at step 0 of output_ids_buf_. After forward, only copy + // the step 1 ~ max_output_seq_len of output_ids_buf_ to output_tensors->at(0).data + + FT_CHECK_WITH_INFO(input_tensors->size() >= 3, "input_tensors->size() >= 3"); + FT_CHECK_WITH_INFO(output_tensors->size() >= 2, "output_tensors->size() >= 2"); + FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); + FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); + FT_CHECK(input_tensors->find("output_seq_len") != input_tensors->end() + && input_tensors->at("output_seq_len").shape.size() == 1); + FT_CHECK(output_tensors->at("output_ids").shape.size() == 3); + FT_CHECK(output_tensors->at("sequence_length").shape.size() == 2); + FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape[0] == output_tensors->at("output_ids").shape[0], + "input_tensors->at(\"input_ids\").shape[0] == output_tensors->at(\"output_ids\").shape[0]"); + + const size_t batch_size = output_tensors->at("output_ids").shape[0]; + const size_t beam_width = output_tensors->at("output_ids").shape[1]; + + PromptLearningType request_prompt_type = PromptLearningType::no_prompt; + int valid_prompt_inputs = input_tensors->count("request_prompt_type") + + input_tensors->count("request_prompt_lengths") + + input_tensors->count("request_prompt_embedding"); + + if (valid_prompt_inputs == 3) { + request_prompt_type = static_cast(input_tensors->at("request_prompt_type").getVal()); + FT_LOG_INFO("Apply prompt embedding from input, will ignore task name ids"); + } + else if (valid_prompt_inputs > 0) { + FT_LOG_WARNING( + "Prompts not applied: request_prompt_embedding, request_prompt_lengths, request_prompt_type are all needed!"); + } + if (request_prompt_type == PromptLearningType::prefix_prompt) { + FT_LOG_WARNING("Request prompt doesn't support prefix prompt currently!"); + } + + // Prefix Prompt Inputs + // Padding works as follows: p p x x i i i x x --> p p i i i x x x x (p denotes prompt, i denotes input, x denotes + // pad) + // TODO (perkzz): move unnecessary paddings + const int* prompt_learning_task_name_ids = + input_tensors->count("prompt_learning_task_name_ids") ? 
+ input_tensors->at("prompt_learning_task_name_ids").getPtr() : + nullptr; + has_prefix_prompt_ = + (prompt_learning_task_name_ids != nullptr) && (prompt_learning_type_ == PromptLearningType::prefix_prompt); + int max_prefix_prompt_length = 0; + + FT_CHECK_WITH_INFO( + !(prompt_learning_task_name_ids != nullptr + && (prompt_learning_type_ == PromptLearningType::no_prompt + || prompt_learning_type_ == PromptLearningType::soft_prompt)), + "prompt_learning_type is prefix_prompt either p_prompt_tuning when prompt_learning_task_name_ids are provided."); + + // NOTE: Prefix Prompt PreProcessing + // get prefix_prompt_weight for each batch --> shape [batch, beam_width] + // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] + std::vector prefix_prompt_weight_batch_ptrs; + std::vector prefix_prompt_lengths; + if (has_prefix_prompt_) { + for (int bs_id = 0; bs_id < batch_size; ++bs_id) { + int task_id = prompt_learning_task_name_ids[bs_id]; + // throw errors when prompt task_name_ids are not found + std::pair prefix_prompt_weight_length_pair; + try { + prefix_prompt_weight_length_pair = gpt_weights->prompt_learning_table.at(task_id); + } + catch (const std::out_of_range& oor) { + FT_LOG_ERROR("prefix_prompt_weights_lengths not found for prompt task id: " + task_id); + throw oor; + } + for (int bw_id = 0; bw_id < beam_width; ++bw_id) { + prefix_prompt_weight_batch_ptrs.push_back(prefix_prompt_weight_length_pair.first); + prefix_prompt_lengths.push_back(prefix_prompt_weight_length_pair.second); + } + } + + max_prefix_prompt_length = *max_element(prefix_prompt_lengths.begin(), prefix_prompt_lengths.end()); + + FT_LOG_DEBUG("max_prefix_prompt_length: %d", max_prefix_prompt_length); + + if (max_prefix_prompt_length == 0) { + has_prefix_prompt_ = false; + FT_LOG_DEBUG("prompts are not applied !"); + } + } + + int max_input_length = input_tensors->at("input_ids").shape[1]; + FT_CHECK_WITH_INFO(!(max_input_length == 0 && max_prefix_prompt_length > 0), + "Prefix Prompt should come with inputs!"); + + // Prefix Soft Prompt + has_prefix_soft_prompt_ = request_prompt_type == PromptLearningType::soft_prompt; + const size_t max_prefix_soft_prompt_length = + has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; + const size_t limit_len_offset = max_prefix_soft_prompt_length + (max_input_length == 0 ? 1 : 0); + const size_t max_output_seq_len = input_tensors->at("output_seq_len").max() + limit_len_offset; + const size_t max_seq_len = max_output_seq_len; + // max cache seq len should include max prefix prompt length as it has k/v states + const size_t max_cache_seq_len = max_output_seq_len + max_prefix_prompt_length; + if (max_cache_seq_len < max_seq_len) { + FT_LOG_WARNING("max_cache_seq_len (%d) is less than max_seq_len (%d). " + "Note that this reduces the memory cost of k/v cache, but may hurt the accuracy.", + max_cache_seq_len, + max_seq_len); + } + else if (max_cache_seq_len > max_seq_len) { + FT_LOG_WARNING("max_cache_seq_len (%d) is larger than max_seq_len (%d). " + "This may lead to additional memory cost. 
Suggest to use smaller max_cache_seq_len.", + max_cache_seq_len, + max_seq_len); + } + const cudaDataType_t gemm_data_type = getCudaDataType(); + allocateBuffer( + batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length + max_prefix_soft_prompt_length); + setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); + + sync_check_cuda_error(); + { + TensorMap input_map(*input_tensors); + dynamic_decode_layer_->setup(batch_size, beam_width, &input_map); + handleOptArg(&input_map, "start_id", start_ids_buf_, start_id_, batch_size); + handleOptArg(&input_map, "end_id", end_ids_buf_, end_id_, batch_size); + } + + const DataType data_type = getTensorType(); + + const std::vector self_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, + batch_size * beam_width, + local_head_num_, + size_per_head_ / (16 / sizeof(T)), + max_cache_seq_len, + 16 / sizeof(T)}; + const std::vector self_v_cache_shape = {num_layer_ / pipeline_para_.world_size_, + batch_size * beam_width, + local_head_num_, + max_cache_seq_len, + size_per_head_}; + + // initialize the output ids and parent ids + cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); + cudaMemsetAsync(parent_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); + cudaMemsetAsync(masked_tokens_, false, sizeof(bool) * batch_size * beam_width * max_cache_seq_len, stream_); + cudaMemsetAsync(tiled_total_padding_count_, 0, sizeof(int) * batch_size * beam_width, stream_); + if (beam_width > 1) { + cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); + } + + // Prefix prompts + if (has_prefix_prompt_) { + cudaMemcpyAsync(prompt_learning_weight_batch_, + prefix_prompt_weight_batch_ptrs.data(), + sizeof(T*) * batch_size * beam_width, + cudaMemcpyDefault, + stream_); + cudaMemcpyAsync(tiled_prompt_lengths_buf_, + prefix_prompt_lengths.data(), + sizeof(int) * batch_size * beam_width, + cudaMemcpyDefault, + stream_); + } + + sync_check_cuda_error(); + + // handle first step + if (has_prefix_prompt_ || has_prefix_soft_prompt_ || max_input_length > 1) { + invokeTileGptInputs(tiled_input_ids_buf_, + tiled_input_lengths_buf_, + input_tensors->at("input_ids").getPtr(), + input_tensors->at("input_lengths").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + sync_check_cuda_error(); + + if (has_prefix_soft_prompt_) { + inputIdsEmbeddingLookupPosEncodingSoftPromptParam param; + param.from_tensor = context_decoder_input_buf_; + param.output_ids = output_ids_buf_; + param.input_lengths = tiled_input_lengths_buf_; + param.embedding_table = gpt_weights->pre_decoder_embedding_table; + param.pos_table = gpt_weights->position_encoding_table; + param.prefix_soft_prompt_embedding = input_tensors->at("request_prompt_embedding").getPtr(); + param.prefix_soft_prompt_lengths = input_tensors->at("request_prompt_lengths").getPtr(); + param.input_ids = tiled_input_ids_buf_; + param.start_step = 1; + param.max_input_length = max_input_length; + param.max_prefix_soft_prompt_length = max_prefix_soft_prompt_length; + param.batch_size = batch_size; + param.beam_width = beam_width; + param.hidden_units = hidden_units_; + param.stream = stream_; + + invokeInputIdsEmbeddingLookupPosEncodingSoftPrompt(param); + sync_check_cuda_error(); + max_input_length += max_prefix_soft_prompt_length; // view soft_prompt as input + } + else { + invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, 
+ output_ids_buf_, + gpt_weights->pre_decoder_embedding_table, + gpt_weights->position_encoding_table, + pPromptTuningParam{}, // no p/prompt tuning + tiled_input_ids_buf_, + 1, + max_input_length, + max_input_length, + batch_size * beam_width, + hidden_units_, + stream_); + sync_check_cuda_error(); + } + + invokeBuildDecoderAttentionMask(input_attention_mask_, + tiled_input_lengths_buf_, + tiled_prompt_lengths_buf_, + batch_size * beam_width, + max_input_length, + max_prefix_prompt_length, + stream_); + sync_check_cuda_error(); + + std::unordered_map decoder_input_tensors{ + {"decoder_input", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width, (size_t)max_input_length, hidden_units_}, + context_decoder_input_buf_}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width, + 1, + (size_t)max_input_length, + (size_t)(max_input_length + max_prefix_prompt_length)}, + input_attention_mask_}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_}}, + {"d_prefix_prompt_batch", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width}, + has_prefix_prompt_ ? prompt_learning_weight_batch_ : nullptr}}, + {"d_prefix_prompt_lengths", + Tensor{MEMORY_GPU, + TYPE_INT32, + {batch_size * beam_width}, + has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : nullptr}}}; + + std::unordered_map decoder_output_tensors{ + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width, (size_t)max_input_length, hidden_units_}, + context_decoder_output_buf_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}, + {"last_token_hidden_units", + Tensor{MEMORY_GPU, data_type, {batch_size * beam_width, hidden_units_}, decoder_output_buf_}}}; + + gpt_context_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + sync_check_cuda_error(); + invokeDecodingInitialize(finished_buf_, + sequence_lengths_, + nullptr, + cum_log_probs_, + start_ids_buf_, + batch_size, + beam_width, + max_input_length - 1, + stream_); + sync_check_cuda_error(); + } + else if (max_input_length == 0) { + FT_CHECK(prompt_learning_type_ == PromptLearningType::no_prompt + && request_prompt_type == PromptLearningType::no_prompt); // Not support prompts in this case + max_input_length++; + invokeDecodingInitialize(finished_buf_, + sequence_lengths_, + output_ids_buf_, + cum_log_probs_, + start_ids_buf_, + batch_size, + beam_width, + max_input_length - 1, + stream_); + std::vector h_input_lengths(batch_size * beam_width, 1); + cudaMemcpyAsync(tiled_input_lengths_buf_, + h_input_lengths.data(), + sizeof(int) * batch_size * beam_width, + cudaMemcpyHostToDevice, + stream_); + sync_check_cuda_error(); + } + else if (max_input_length == 1) { + FT_CHECK(prompt_learning_type_ == PromptLearningType::no_prompt + && request_prompt_type == PromptLearningType::no_prompt); // Not support prompts in this case + invokeDecodingInitialize(finished_buf_, + sequence_lengths_, + nullptr, + cum_log_probs_, + start_ids_buf_, + batch_size, + beam_width, + max_input_length - 1, + stream_); + sync_check_cuda_error(); + invokeTileGptInputs(tiled_input_ids_buf_, + tiled_input_lengths_buf_, + input_tensors->at("input_ids").getPtr(), + input_tensors->at("input_lengths").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + sync_check_cuda_error(); + + cudaMemcpyAsync(output_ids_buf_, + 
tiled_input_ids_buf_, + sizeof(int) * batch_size * beam_width, + cudaMemcpyDeviceToDevice, + stream_); + } + + if (vocab_size_ == vocab_size_padded_) { + padded_embedding_kernel_ptr_ = gpt_weights->post_decoder_embedding.kernel; + } + else { + cudaMemcpyAsync(padded_embedding_kernel_, + gpt_weights->post_decoder_embedding.kernel, + sizeof(T) * vocab_size_ * hidden_units_, + cudaMemcpyDeviceToDevice, + stream_); + cudaMemcpyAsync(padded_embedding_bias_, + gpt_weights->post_decoder_embedding.bias, + sizeof(T) * vocab_size_, + cudaMemcpyDeviceToDevice, + stream_); + sync_check_cuda_error(); + } + + invokeMaskPaddingTokens(masked_tokens_, + input_tensors->at("input_lengths").getPtr(), // not_tiled + tiled_prompt_lengths_buf_, + max_cache_seq_len, + max_input_length + max_prefix_prompt_length, + 0, + batch_size, + beam_width, + stream_); + + for (int step = max_input_length; step < (int)max_output_seq_len; step++) { + const int src_indir_idx = (step - max_input_length) % 2; + const int tgt_indir_idx = 1 - src_indir_idx; + + const size_t local_batch_size = getLocalBatchSize(batch_size, 1, pipeline_para_.world_size_); + FT_CHECK(batch_size % local_batch_size == 0); + const size_t iteration_num = batch_size / local_batch_size; + *generation_should_stop_ = true; + + for (uint ite = 0; ite < iteration_num; ++ite) { + const int id_offset = ite * local_batch_size * beam_width; + const int hidden_units_offset = id_offset * hidden_units_; + const int vocab_size_units_offset = id_offset * vocab_size_padded_; + + if (!(max_input_length > 1 && step == max_input_length)) { + if (pipeline_para_.rank_ == 0) { + invokeEmbeddingLookupPosEncodingPadCount(decoder_input_buf_ + hidden_units_offset, + gpt_weights->pre_decoder_embedding_table, + gpt_weights->position_encoding_table, + output_ids_buf_ + id_offset, + tiled_total_padding_count_ + id_offset, + local_batch_size * beam_width, + hidden_units_, + (T)(1.0f), + step - 1, + batch_size * beam_width, + 0, + stream_); + sync_check_cuda_error(); + } + std::unordered_map decoder_input_tensors{ + {"decoder_input", + Tensor{MEMORY_GPU, + data_type, + {local_batch_size * beam_width, hidden_units_}, + decoder_input_buf_ + hidden_units_offset}}, + {"finished", + Tensor{MEMORY_GPU, TYPE_BOOL, {local_batch_size * beam_width}, finished_buf_ + id_offset}}, + {"sequence_lengths", + Tensor{MEMORY_GPU, TYPE_INT32, {local_batch_size * beam_width}, sequence_lengths_ + id_offset}}, + {"total_padding_tokens", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size * beam_width}, + tiled_total_padding_count_ + id_offset}}, + {"d_prefix_prompt_lengths", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size}, + has_prefix_prompt_ ? (tiled_prompt_lengths_buf_ + id_offset) : nullptr}}, + {"max_prefix_prompt_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_prefix_prompt_length}}, + {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, + {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, + {"ite", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &ite}}, + {"cache_indirection", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size, beam_width, max_output_seq_len}, + beam_width > 1 ? 
cache_indirections_[src_indir_idx] + id_offset * max_output_seq_len : + nullptr}}, + {"masked_tokens", + Tensor{MEMORY_GPU, + TYPE_BOOL, + {local_batch_size * beam_width, max_cache_seq_len}, + masked_tokens_ + id_offset * max_cache_seq_len}}}; + std::unordered_map decoder_output_tensors{ + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {local_batch_size * beam_width, hidden_units_}, + decoder_output_buf_ + hidden_units_offset}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; + gpt_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + } + + if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { + invokeGeneralLayerNorm(normed_decoder_output_buf_ + hidden_units_offset, + decoder_output_buf_ + hidden_units_offset, + gpt_weights->post_decoder_layernorm.gamma, + gpt_weights->post_decoder_layernorm.beta, + layernorm_eps_, + local_batch_size * beam_width, + hidden_units_, + (float*)nullptr, + 0, + stream_); + sync_check_cuda_error(); + + if (tensor_para_.world_size_ == 1) { + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + vocab_size_padded_, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + logits_buf_ + vocab_size_units_offset, + CUDA_R_32F, + vocab_size_padded_, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); + } + else { + FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); + const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + local_vocab_size, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_ + + tensor_para_.rank_ * local_vocab_size * hidden_units_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + nccl_logits_buf_ + vocab_size_units_offset + + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, + CUDA_R_32F, + local_vocab_size, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); + ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + local_batch_size * beam_width * local_vocab_size, + tensor_para_.rank_, + tensor_para_, + stream_); + invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + tensor_para_.world_size_, + local_batch_size * beam_width, + local_vocab_size, + stream_); + } + + int tmp_local_batch_size = local_batch_size; + bool is_initialize_random_table = step == max_input_length; + std::unordered_map dynamic_decode_input_tensors{ + {"logits", + Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size_padded_}, logits_buf_}}, + // {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size_padded_}, nullptr}}, + {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, + {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, + {"input_lengths", + Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, tiled_input_lengths_buf_}}, + {"sequence_limit_length", Tensor{MEMORY_GPU, TYPE_UINT32, {batch_size}, seq_limit_len_}}, + {"ite", 
Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}}, + {"src_cache_indirection", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size, beam_width, max_output_seq_len}, + cache_indirections_[src_indir_idx] + id_offset * max_output_seq_len}}, + {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &tmp_local_batch_size}}, + {"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids_buf_}}, + {"is_initialize_random_table", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &is_initialize_random_table}}}; + + for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) { + if (dynamic_decode_input_tensors.find(t->first) == dynamic_decode_input_tensors.end()) { + dynamic_decode_input_tensors.insert(*t); + } + } + + // common outputs + bool subbatch_should_stop = false; + std::unordered_map dynamic_decode_output_tensors{ + {"output_ids", + Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids_buf_}}, + {"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, finished_buf_}}, + // cum_log_probs is necessary for beam search, while it is optional for sampling. + {"cum_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + {batch_size * beam_width}, + ((beam_width > 1) || (output_tensors->count("cum_log_probs") > 0)) ? cum_log_probs_ : + nullptr}}, + {"output_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + {max_seq_len, batch_size, beam_width}, + output_tensors->count("output_log_probs") > 0 + && output_tensors->at("output_log_probs").data != nullptr ? + output_log_probs_buf_ : + nullptr}}, + {"parent_ids", + Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, parent_ids_buf_}}, + {"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, sequence_lengths_}}, + {"tgt_cache_indirection", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size, beam_width, max_output_seq_len}, + cache_indirections_[tgt_indir_idx] + id_offset * max_output_seq_len}}, + {"should_stop", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &subbatch_should_stop}}}; + + for (auto t = output_tensors->begin(); t != output_tensors->end(); ++t) { + // Handle exceptions. 
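+                // Note: cum_log_probs and output_log_probs already point at the internal buffers
+                // (cum_log_probs_, output_log_probs_buf_) registered in the map above; the
+                // user-provided tensors are filled later in setOutputTensors, so skip them here.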
+ if (t->first == "cum_log_probs" || t->first == "output_log_probs") { + continue; + } + dynamic_decode_output_tensors.insert(*t); + } + + dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors); + *generation_should_stop_ &= subbatch_should_stop; + } + } + + if (pipeline_para_.world_size_ > 1) { + ftNcclGroupStart(); + ftNcclBroadCast(output_ids_buf_ + step * batch_size * beam_width, + batch_size * beam_width, + pipeline_para_.world_size_ - 1, + pipeline_para_, + stream_); + + ftNcclBroadCast( + sequence_lengths_, batch_size * beam_width, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); + + ftNcclBroadCast(generation_should_stop_, 1, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); + + if (beam_width > 1) { + ftNcclBroadCast(cache_indirections_[tgt_indir_idx], + batch_size * beam_width * max_output_seq_len, + pipeline_para_.world_size_ - 1, + pipeline_para_, + stream_); + } + ftNcclGroupEnd(); + // throw errors when detected + ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); + sync_check_cuda_error(); + } + + if (*generation_should_stop_) { + break; + } + if (token_generated_cb_ && step + 1 < (int)max_output_seq_len) { + setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); + sendTensorsToFirstPipelineNode(output_tensors, input_tensors); + + if (pipeline_para_.rank_ == 0 && tensor_para_.rank_ == 0) { + token_generated_cb_(output_tensors, token_generated_ctx_); + } + } + if (step == max_input_length) { + /* We have just finished processing input: update the padding count: + * total_padding_count += (max_input_length - input_lengths) + * if has prefix prompts, += (max_prefix_prompt_length - prompt_length) + */ + invokeUpdatePaddingCount(tiled_total_padding_count_, + input_tensors->at("input_lengths").getPtr(), // not_tiled + has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : (const int*)nullptr, + max_input_length, + has_prefix_prompt_ ? 
max_prefix_prompt_length : 0, + batch_size, + beam_width, + stream_); + } + } + + setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); + sendTensorsToFirstPipelineNode(output_tensors, input_tensors); +} + +template +void Llama::sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + if (pipeline_para_.world_size_ == 1) { + // throw errors when detected + ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); + return; + } + + const auto pp_rank = pipeline_para_.rank_; + + ftNcclGroupStart(); + for (auto const& it : *output_tensors) { + if (it.second.data == nullptr) { + continue; + } + + if (pp_rank == pipeline_para_.world_size_ - 1) { + ftNcclSend(it.second.getPtr(), it.second.sizeBytes(), 0, pipeline_para_, stream_); + } + else if (pp_rank == 0) { + ftNcclRecv(it.second.getPtr(), + it.second.sizeBytes(), + pipeline_para_.world_size_ - 1, + pipeline_para_, + stream_); + } + } + ftNcclGroupEnd(); + // throw errors when detected + ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); +} + +template +void Llama::setOutputTensors(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const size_t max_input_length, + const size_t max_output_seq_len) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + if (pipeline_para_.rank_ != pipeline_para_.world_size_ - 1) { + return; + } + + const size_t batch_size = output_tensors->at("output_ids").shape[0]; + const size_t beam_width = output_tensors->at("output_ids").shape[1]; + uint* sequence_lengths = output_tensors->at("sequence_length").getPtr(); + const size_t max_prefix_soft_prompt_length = + has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; + + if (input_tensors->at("input_ids").shape[1] == 0) { + invokeCudaD2DcpyConvert( + sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); + // TODO: D2D sequence_lenghts + if (beam_width > 1) { + // For beam search, do gather_tree + // take output_parent_ids as inter buffer + invokeGatherTree(transposed_output_ids_buf_, + sequence_lengths_, + max_output_seq_len, + batch_size, + beam_width, + output_ids_buf_ + batch_size * beam_width, + parent_ids_buf_ + batch_size * beam_width, + end_ids_buf_, + stream_); + + // transpose and take output_parent_ids as inter buffer + invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), + transposed_output_ids_buf_, + max_output_seq_len - 1, + batch_size * beam_width, + 1, + stream_); + } + else { + // For sampling, only copy the results to output_tensor + invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), + output_ids_buf_ + batch_size * beam_width, + max_output_seq_len - 1, + batch_size * beam_width, + 1, + stream_); + } + } + else { + + // For sampling, it is equivalent to all parent ids are 0. + gatherTreeParam param; + param.beams = transposed_output_ids_buf_; + param.max_sequence_lengths = sequence_lengths_; + // add sequence_length 1 here because the sequence_length of time step t is t - 1 + param.max_sequence_length_final_step = 1; + param.max_time = max_output_seq_len; + param.batch_size = batch_size; + param.beam_width = beam_width; + param.step_ids = output_ids_buf_; + param.parent_ids = beam_width == 1 ? nullptr : parent_ids_buf_; + param.end_tokens = end_ids_buf_; + param.max_input_length = max_input_length; + param.prefix_soft_prompt_lengths = + has_prefix_soft_prompt_ ? 
input_tensors->at("request_prompt_lengths").getPtr() : nullptr; + param.input_lengths = tiled_input_lengths_buf_; + param.max_prefix_soft_prompt_length = max_prefix_soft_prompt_length; + param.max_input_without_prompt_length = max_input_length; + param.stream = stream_; + param.output_ids = output_tensors->at("output_ids").getPtr(); + invokeGatherTree(param); + invokeCudaD2DcpyConvert( + sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); + sync_check_cuda_error(); + } + if ((output_tensors->count("output_log_probs") > 0 && output_tensors->at("output_log_probs").data != nullptr)) { + invokeTransposeAxis01(output_tensors->at("output_log_probs").getPtr(), + output_log_probs_buf_, + input_tensors->at("output_seq_len").max() - max_input_length, + batch_size * beam_width, + 1, + stream_); + } + // Return the cumulative log probability if requested. + if (output_tensors->count("cum_log_probs") > 0) { + Tensor cum_log_probs = output_tensors->at("cum_log_probs"); + FT_CHECK_WITH_INFO(cum_log_probs.size() == batch_size * beam_width, + "The shape of cum_log_probs does not match with batch_size x beam_width."); + cudaAutoCpy(cum_log_probs.getPtr(), cum_log_probs_, cum_log_probs.size(), stream_); + } +} + +template +size_t Llama::getPipelineParallelRank() +{ + return pipeline_para_.rank_; +} + +template +size_t Llama::getPipelineParallelSize() +{ + return pipeline_para_.world_size_; +} + +template +size_t Llama::getTensorParallelRank() +{ + return tensor_para_.rank_; +} + +template +size_t Llama::getTensorParallelSize() +{ + return tensor_para_.world_size_; +} + +template +bool* Llama::getFinishBuffer() +{ + return finished_buf_; +} + +template class Llama; +template class Llama; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h new file mode 100644 index 000000000..cd79ac0b8 --- /dev/null +++ b/src/fastertransformer/models/llama/Llama.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include "src/fastertransformer/layers/DynamicDecodeLayer.h" +#include "src/fastertransformer/models/llama/LlamaContextDecoder.h" +#include "src/fastertransformer/models/llama/LlamaDecoder.h" +#include "src/fastertransformer/models/llama/LlamaWeight.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/prompt_learning.h" + +namespace fastertransformer { + +template +class Llama: public BaseLayer { +private: + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t vocab_size_; + size_t rotary_embedding_dim_; + + static constexpr bool neox_rotary_style_ = true; + static constexpr float layernorm_eps_ = 1e-5f; + + int start_id_; + int end_id_; + size_t hidden_units_; + + size_t local_head_num_; + NcclParam tensor_para_; + NcclParam pipeline_para_; + + std::shared_ptr custom_all_reduce_comm_; + int enable_custom_all_reduce_; + + AttentionType attention_type_; + + size_t vocab_size_padded_; + const bool is_context_qk_buf_float_ = + (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || + std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); + + // Residual Type + const bool use_gptj_residual_ = true; + + // Prompt Learning Parameters + PromptLearningType prompt_learning_type_; + int prompt_learning_start_id_; // start_id for prompt_learning (only needed by prefix prompts) + bool has_prefix_prompt_; + bool has_prefix_soft_prompt_; + + LlamaDecoder* gpt_decoder_; + LlamaContextDecoder* gpt_context_decoder_; + DynamicDecodeLayer* dynamic_decode_layer_; + + void allocateBuffer() override; + void allocateBuffer( + size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); + void freeBuffer() override; + + void initialize(); + +protected: + T* padded_embedding_kernel_; + T* padded_embedding_bias_; + const T* padded_embedding_kernel_ptr_; + + T* input_attention_mask_; + + T* decoder_input_buf_; + T* decoder_output_buf_; + T* normed_decoder_output_buf_; + + float* logits_buf_; + float* nccl_logits_buf_; + float* cum_log_probs_; + + bool* finished_buf_; + bool* h_finished_buf_; + int* sequence_lengths_ = nullptr; + int* tiled_total_padding_count_ = nullptr; + uint32_t* seq_limit_len_ = nullptr; + + T* key_cache_; + T* value_cache_; + int* cache_indirections_[2] = {nullptr, nullptr}; + + // prompt_learning weight_batch ptrs + const T** prompt_learning_weight_batch_; + int* tiled_prompt_lengths_buf_; // only needed by prefix prompts + + int* tiled_input_ids_buf_; + int* tiled_input_lengths_buf_; + int* transposed_output_ids_buf_; + int* output_ids_buf_; + int* parent_ids_buf_; + int* start_ids_buf_; + int* end_ids_buf_; + bool* masked_tokens_ = nullptr; + + bool* generation_should_stop_ = nullptr; + + T* context_decoder_input_buf_; + T* context_decoder_output_buf_; + float* output_log_probs_buf_; + + // function pointer callback + using callback_sig = void(std::unordered_map*, void*); + callback_sig* token_generated_cb_ = nullptr; + void* token_generated_ctx_ = nullptr; + + void setOutputTensors(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const size_t max_input_length, + const size_t max_seq_len); + void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors); + +public: + Llama(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t 
rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce = 0); + + Llama(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce = 0); + + Llama(Llama const& Llama); + + ~Llama(); + + void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const LlamaWeight* gpt_weights); + + void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const LlamaWeight* gpt_weights); + + size_t getPipelineParallelRank(); + size_t getPipelineParallelSize(); + size_t getTensorParallelRank(); + size_t getTensorParallelSize(); + bool* getFinishBuffer(); + + void registerCallback(callback_sig* fn, void* ctx); + void unRegisterCallback(); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc new file mode 100644 index 000000000..f107d38c1 --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/LlamaContextDecoder.h" +#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" +#include "src/fastertransformer/kernels/gpt_kernels.h" + +#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.h" + +namespace fastertransformer { + +template +void LlamaContextDecoder::initialize() +{ + self_attention_layer_ = new TensorParallelGptContextAttentionLayer(0, // max_batch_size + 0, // max_seq_len + head_num_, + size_per_head_, + rotary_embedding_dim_, + neox_rotary_style_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + is_qk_buf_float_, + false, + 0, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size + 0, // max_seq_len + head_num_, + size_per_head_, + 0, // expert_num + inter_size_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + false, + 0, + true, // use_gated_activation = true; + custom_all_reduce_comm_, + enable_custom_all_reduce_); +} + +template +void LlamaContextDecoder::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +{ + decoder_normed_input_ = reinterpret_cast( + allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + self_attn_output_ = reinterpret_cast( + allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + ffn_output_ = reinterpret_cast( + allocator_->reMalloc(ffn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + decoder_layer_output_ = reinterpret_cast( + allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); + padding_offset_ = + reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); + cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + is_allocate_buffer_ = true; +} + +template +void LlamaContextDecoder::freeBuffer() +{ + if (is_allocate_buffer_ == true) { + allocator_->free((void**)(&decoder_normed_input_)); + allocator_->free((void**)(&self_attn_output_)); + allocator_->free((void**)(&ffn_output_)); + allocator_->free((void**)(&decoder_layer_output_)); + allocator_->free((void**)(&h_pinned_token_num_ptr_), true); + allocator_->free((void**)(&padding_offset_)); + allocator_->free((void**)(&cu_seqlens_)); + is_allocate_buffer_ = false; + } +} + +template +bool LlamaContextDecoder::isValidLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) + && (l < local_num_layer * (pipeline_para_.rank_ + 1)); +} + +template +bool LlamaContextDecoder::isFirstLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); +} + +template +bool LlamaContextDecoder::isLastLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l 
== local_num_layer * (pipeline_para_.rank_ + 1) - 1); +} + +template +int LlamaContextDecoder::getFirstLayerParallelId() +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return local_num_layer * pipeline_para_.rank_; +} + +template +LlamaContextDecoder::LlamaContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + rotary_embedding_dim_(rotary_embedding_dim), + neox_rotary_style_(neox_rotary_style), + use_gptj_residual_(use_gptj_residual), + layernorm_eps_(layernorm_eps), + hidden_units_(head_num * size_per_head), + tensor_para_(tensor_para), + pipeline_para_(pipeline_para), + is_qk_buf_float_(is_qk_buf_float), + attention_type_(attention_type), + custom_all_reduce_comm_(custom_all_reduce_comm), + enable_custom_all_reduce_(enable_custom_all_reduce) +{ + initialize(); +} + +template +LlamaContextDecoder::LlamaContextDecoder(LlamaContextDecoder const& decoder): + BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), + head_num_(decoder.head_num_), + size_per_head_(decoder.size_per_head_), + inter_size_(decoder.inter_size_), + num_layer_(decoder.num_layer_), + rotary_embedding_dim_(decoder.rotary_embedding_dim_), + neox_rotary_style_(decoder.neox_rotary_style_), + use_gptj_residual_(decoder.use_gptj_residual_), + layernorm_eps_(decoder.layernorm_eps_), + hidden_units_(decoder.hidden_units_), + tensor_para_(decoder.tensor_para_), + pipeline_para_(decoder.pipeline_para_), + is_qk_buf_float_(decoder.is_qk_buf_float_), + attention_type_(decoder.attention_type_), + custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), + enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) +{ + initialize(); +} + +template +LlamaContextDecoder::~LlamaContextDecoder() +{ + delete self_attention_layer_; + delete ffn_layer_; + freeBuffer(); +} + +template +void LlamaContextDecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, + {"attention_mask", input_tensors->at(1)}, + {"input_lengths", input_tensors->at(2)}}; + std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, + {"key_cache", output_tensors->at(1)}, + {"value_cache", output_tensors->at(2)}, + {"last_token_hidden_units", output_tensors->at(3)}}; + + forward(&output_tensors_map, &input_tensors_map, gpt_decoder_layer_weight); +} + +template +void LlamaContextDecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + // input tensors: + // decoder_input [batch_size, seq_len, hidden_dimension], + // attention_mask [batch_size, 1, seq_len, seq_len + max_prompt_length] + // input_lengths [batch_size] + // d_prefix_prompt_batch [batch_size], + // each element 
contains ptr with buffer shape[2, local_head_num_, prompt_length, size_per_head] + // prefix_prompt_lengths [batch size] + + // output tensors: + // decoder_output [batch_size, seq_len, hidden_dimension], + // key_cache [num_layer, batch, local_head_num, size_per_head // x, max_seq_len, x] + // value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] + // last_token_hidden_units [batch_size, hidden_dimension] + + // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * local_batch_size'. + // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during + // computing. + + FT_CHECK(input_tensors->size() == 5); + FT_CHECK(output_tensors->size() == 4); + + const int batch_size = input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; + const int max_prompt_length = + input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; + const DataType data_type = getTensorType(); + allocateBuffer(batch_size, seq_len); + + T* decoder_input = input_tensors->at("decoder_input").getPtr(); + T* decoder_output = output_tensors->at("decoder_output").getPtr(); + const T* attention_mask = input_tensors->at("attention_mask").getPtr(); + const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); + const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); + FT_CHECK(batch_size % local_batch_size == 0); + const int iteration_num = batch_size / local_batch_size; + + Tensor& k_cache = output_tensors->at("key_cache"); + Tensor& v_cache = output_tensors->at("value_cache"); + std::vector self_k_cache_size; + self_k_cache_size.push_back(local_batch_size); + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + self_k_cache_size.push_back(*t); + } + std::vector self_v_cache_size; + self_v_cache_size.push_back(local_batch_size); + for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { + self_v_cache_size.push_back(*t); + } + + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? 
+ getUnfusedAttentionType(attention_type_) : + attention_type_; + const bool is_unpadded_mha = isUnPaddedMHA(attention_type); + + for (int ite = 0; ite < iteration_num; ite++) { + size_t h_token_num = local_batch_size * seq_len; + if (is_unpadded_mha) { + const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, + &h_token_num, + padding_offset_, + cu_seqlens_, + base_input_lengths + ite * local_batch_size, + local_batch_size, + seq_len, + stream_); + } + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l) == false) { + continue; + } + + if (l == 0 && is_unpadded_mha) { + invokeRemovePadding(decoder_layer_output_, + decoder_input + ite * local_batch_size * seq_len * hidden_units_, + padding_offset_, + h_token_num, + hidden_units_, + stream_); + } + + const bool is_final = false; // TODO(bhsueh) remove this flag + T* layer_input = decoder_layer_output_; + T* layer_output = decoder_layer_output_; + if (!is_unpadded_mha) { + if (l == 0) { + layer_input = decoder_input; + layer_input += ite * local_batch_size * seq_len * hidden_units_; + } + if (l == num_layer_ - 1) { + layer_output = decoder_output; + layer_output += ite * local_batch_size * seq_len * hidden_units_; + } + } + + if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_ / tensor_para_.world_size_; + ftNcclRecv(layer_input + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ - 1, + pipeline_para_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); + } + } + + invokeGeneralT5LayerNorm(decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + h_token_num, + hidden_units_, + stream_); + sync_check_cuda_error(); + + TensorMap self_attention_input_tensors{ + {"input_query", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, + attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, + {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, + {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; + self_attention_input_tensors.insertIfValid( + "d_prefix_prompt_batch", + Tensor{MEMORY_GPU, + data_type, + {(size_t)local_batch_size}, + d_prefix_prompt_batch != nullptr ? d_prefix_prompt_batch + ite * local_batch_size : nullptr}); + self_attention_input_tensors.insertIfValid("d_prefix_prompt_lengths", + Tensor{MEMORY_GPU, + TYPE_INT32, + {(size_t)local_batch_size}, + d_prefix_prompt_lengths != nullptr ? 
+ d_prefix_prompt_lengths + ite * local_batch_size : + nullptr}); + + if (is_unpadded_mha) { + self_attention_input_tensors.insert("padding_offset", + Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset_}); + self_attention_input_tensors.insert( + "cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {size_t(local_batch_size + 1)}, cu_seqlens_}); + } + + size_t cache_offset = l - getFirstLayerParallelId(); + for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { + cache_offset *= *t; + }; + size_t ite_cache_offset = ite * local_batch_size; + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + ite_cache_offset *= *t; + } + cache_offset += ite_cache_offset; + + TensorMap self_attention_output_tensors{ + {"hidden_features", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, + {"value_cache", + Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + + self_attention_layer_->forward(&self_attention_output_tensors, + &self_attention_input_tensors, + &gpt_decoder_layer_weight->at(l)->self_attention_weights); + + if (is_final == false) { + if (use_gptj_residual_) { + invokeGeneralLayerNorm(decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + layernorm_eps_, + h_token_num, + hidden_units_, + (float*)nullptr, + 0, + stream_); + } + else { + invokeGeneralAddResidualT5PreLayerNorm( + self_attn_output_, + decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + layernorm_eps_, + h_token_num, + hidden_units_, + stream_); + } + + TensorMap ffn_input_tensors( + {{"ffn_input", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); + TensorMap ffn_output_tensors({{"ffn_output", + Tensor{MEMORY_GPU, + data_type, + {h_token_num, (size_t)hidden_units_}, + use_gptj_residual_ ? 
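+                                          // with the GPT-J style parallel residual the FFN writes to a
+                                          // scratch buffer and is merged with the attention output and
+                                          // the residual below; otherwise it writes straight into layer_output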
ffn_output_ : layer_output}}}); + ffn_layer_->forward( + &ffn_output_tensors, &ffn_input_tensors, &gpt_decoder_layer_weight->at(l)->ffn_weights); + + if (use_gptj_residual_) { + // Original workflow: + // layer_output = layer_input + reduceSum(ffn_output + self_attn_output + ffn_output_bias) + // Our workflow: + // layer_output = reduceSum(ffn_output + self_attn_output + ffn_output_bias + layer_input / + // TP_size) + // They are equivalent on math, but we can use same buffer for layer_input and layer_output + + invokeAddBiasAttentionFfnResidual(layer_output, + ffn_output_, + self_attn_output_, + layer_input, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + tensor_para_.world_size_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllReduceSum( + layer_output, layer_output, h_token_num * hidden_units_, tensor_para_, stream_); + } + } + else { + invokeAddBiasResidual(layer_output, + self_attn_output_, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + stream_); + } + + sync_check_cuda_error(); + + if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 + && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_ / tensor_para_.world_size_; + ftNcclSend(layer_output + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ + 1, + pipeline_para_, + stream_); + } + + if ((l == num_layer_ - 1) && is_unpadded_mha) { + invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + decoder_layer_output_, + padding_offset_, + h_token_num, + head_num_ * size_per_head_, + stream_); + } + } + } + } + + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer + invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), + output_tensors->at("decoder_output").getPtr(), + input_tensors->at("input_lengths").getPtr(), + seq_len, + batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } +} + +template class LlamaContextDecoder; +template class LlamaContextDecoder; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.h b/src/fastertransformer/models/llama/LlamaContextDecoder.h new file mode 100644 index 000000000..788d1d45d --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include "src/fastertransformer/kernels/add_residual_kernels.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/layers/BaseLayer.h" +#include "src/fastertransformer/layers/FfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" +#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h" +#include "src/fastertransformer/utils/Tensor.h" +#include "src/fastertransformer/utils/allocator.h" +#include "src/fastertransformer/utils/cublasMMWrapper.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace fastertransformer { + +template +class LlamaContextDecoder: public BaseLayer { +private: + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t rotary_embedding_dim_; + bool neox_rotary_style_; + bool use_gptj_residual_; + float layernorm_eps_; + + // calculated data + size_t hidden_units_; + + NcclParam tensor_para_; + NcclParam pipeline_para_; + + std::shared_ptr custom_all_reduce_comm_; + int enable_custom_all_reduce_; + + AttentionType attention_type_; + + bool is_qk_buf_float_; + + BaseAttentionLayer* self_attention_layer_; + FfnLayer* ffn_layer_; + + void allocateBuffer() override; + void allocateBuffer(size_t batch_size, size_t seq_len); + void freeBuffer() override; + + bool isValidLayerParallelId(uint l); + bool isFirstLayerParallelId(uint l); + bool isLastLayerParallelId(uint l); + int getFirstLayerParallelId(); + + void initialize(); + +protected: + T* decoder_normed_input_ = nullptr; + T* self_attn_output_ = nullptr; + T* ffn_output_ = nullptr; + T* decoder_layer_output_ = nullptr; + size_t* h_pinned_token_num_ptr_ = nullptr; + int* padding_offset_ = nullptr; + int* cu_seqlens_ = nullptr; + +public: + LlamaContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type = AttentionType::FUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce_ = 0); + + LlamaContextDecoder(LlamaContextDecoder const& decoder); + + ~LlamaContextDecoder(); + + void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* decoder_layer_weights); + + void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* gpt_decoder_layer_weight); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc new file mode 100644 index 000000000..d5fb58fee --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/models/llama/LlamaDecoder.h" +#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.h" + +namespace fastertransformer { + +template +void LlamaDecoder::initialize() +{ + self_attention_layer_ = new TensorParallelDecoderSelfAttentionLayer(0, // max_batch_size + head_num_, + size_per_head_, + rotary_embedding_dim_, + neox_rotary_style_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + false, + 0, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size + 1, + head_num_, + size_per_head_, + 0, // expert_num + inter_size_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + false, + 0, + true, // use_gated_activation = true; + custom_all_reduce_comm_, + enable_custom_all_reduce_); +} + +template +void LlamaDecoder::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void LlamaDecoder::allocateBuffer(size_t batch_size) +{ + decoder_normed_input_ = reinterpret_cast( + allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * hidden_units_, false)); + self_attn_output_ = + reinterpret_cast(allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * hidden_units_, false)); + ffn_output_ = + reinterpret_cast(allocator_->reMalloc(ffn_output_, sizeof(T) * batch_size * hidden_units_, false)); + decoder_layer_output_ = reinterpret_cast( + allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * hidden_units_, false)); + is_allocate_buffer_ = true; +} + +template +void LlamaDecoder::freeBuffer() +{ + if (is_allocate_buffer_ == true) { + allocator_->free((void**)(&decoder_normed_input_)); + allocator_->free((void**)(&self_attn_output_)); + allocator_->free((void**)(&ffn_output_)); + allocator_->free((void**)(&decoder_layer_output_)); + is_allocate_buffer_ = false; + } +} + +template +bool LlamaDecoder::isValidLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) + && (l < local_num_layer * (pipeline_para_.rank_ + 1)); +} + +template +bool LlamaDecoder::isFirstLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); +} + +template +bool LlamaDecoder::isLastLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); +} + +template +int LlamaDecoder::getFirstLayerParallelId() +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return local_num_layer * pipeline_para_.rank_; +} + +template +LlamaDecoder::LlamaDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t 
num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + rotary_embedding_dim_(rotary_embedding_dim), + neox_rotary_style_(neox_rotary_style), + use_gptj_residual_(use_gptj_residual), + layernorm_eps_(layernorm_eps), + hidden_units_(head_num_ * size_per_head), + tensor_para_(tensor_para), + pipeline_para_(pipeline_para), + custom_all_reduce_comm_(custom_all_reduce_comm), + enable_custom_all_reduce_(enable_custom_all_reduce) +{ + initialize(); +} + +template +LlamaDecoder::LlamaDecoder(LlamaDecoder const& decoder): + BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), + head_num_(decoder.head_num_), + size_per_head_(decoder.size_per_head_), + inter_size_(decoder.inter_size_), + num_layer_(decoder.num_layer_), + rotary_embedding_dim_(decoder.rotary_embedding_dim_), + neox_rotary_style_(decoder.neox_rotary_style_), + use_gptj_residual_(decoder.use_gptj_residual_), + layernorm_eps_(decoder.layernorm_eps_), + hidden_units_(decoder.hidden_units_), + tensor_para_(decoder.tensor_para_), + pipeline_para_(decoder.pipeline_para_), + custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), + enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) +{ + initialize(); +} + +template +LlamaDecoder::~LlamaDecoder() +{ + delete self_attention_layer_; + delete ffn_layer_; + freeBuffer(); +} + +template +void LlamaDecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + FT_CHECK(false); +} + +template +void LlamaDecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + // input tensors: + // decoder_input [local_batch_size, hidden_dimension], + // finished [local_batch_size], + // sequence_lengths [local_batch_size] + // total_padding_tokens [local_batch_size], + // max_input_length [1] on cpu + // d_prefix_prompt_lengths [local_batch_size], on GPU + // max_prefix_prompt_length [1] on cpu + // step [1] on cpu + // ite [1] on cpu + // cache_indirection [local_batch_size / beam_width, beam_width, memory_len] + // Here, local_batch_size contains the beam_width, so local_batch_size / beam_width + // is real local_batch_size. 
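+    //      (cache_indirection is the beam-search indirection table and is presumably
+    //      only meaningful when beam_width > 1)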
+ // masked_tokens[local_batch_size, memory_len] + + // output tensors: + // decoder_output [local_batch_size, hidden_dimension], + // key_cache [num_layer, batch_size, head_num, size_per_head // x, memory_len, x] + // value_cache [num_layer, batch_size, head_num, memory_len, size_per_head] + + FT_CHECK(input_tensors->size() == 11); + FT_CHECK(output_tensors->size() == 3); + + const DataType data_type = getTensorType(); + const size_t local_batch_size = input_tensors->at("decoder_input").shape[0]; + allocateBuffer(local_batch_size); + const int ite = input_tensors->at("ite").getVal(); + + T* decoder_input = input_tensors->at("decoder_input").getPtr(); + T* decoder_output = output_tensors->at("decoder_output").getPtr(); + + Tensor& k_cache = output_tensors->at("key_cache"); + Tensor& v_cache = output_tensors->at("value_cache"); + std::vector self_k_cache_size; + self_k_cache_size.push_back(local_batch_size); + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + self_k_cache_size.push_back(*t); + } + std::vector self_v_cache_size; + self_v_cache_size.push_back(local_batch_size); + for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { + self_v_cache_size.push_back(*t); + } + + for (uint l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l) == false) { + continue; + } + T* layer_input = (l == 0) ? decoder_input : decoder_layer_output_; + T* layer_output = (l == num_layer_ - 1) ? decoder_output : decoder_layer_output_; + + if (isFirstLayerParallelId(l) == true && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { + int data_size = local_batch_size * hidden_units_ / tensor_para_.world_size_; + // ftNcclRecv(layer_input, local_batch_size * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, + // stream_); + + ftNcclRecv(layer_input + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ - 1, + pipeline_para_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); + } + } + + invokeGeneralT5LayerNorm(decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + local_batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); + + TensorMap self_attention_input_tensors(*input_tensors); + self_attention_input_tensors.insert( + "input_query", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}); + + size_t cache_offset = l - getFirstLayerParallelId(); + for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { + cache_offset *= *t; + }; + size_t ite_cache_offset = ite * local_batch_size; + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + ite_cache_offset *= *t; + } + cache_offset += ite_cache_offset; + + TensorMap self_attention_output_tensors{ + {"hidden_features", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, self_attn_output_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + + self_attention_layer_->forward(&self_attention_output_tensors, + &self_attention_input_tensors, + &gpt_decoder_layer_weight->at(l)->self_attention_weights); + if (use_gptj_residual_) { + invokeGeneralLayerNorm(decoder_normed_input_, + layer_input, + 
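+                                   // parallel (GPT-J style) residual: the FFN consumes LN(layer_input)
+                                   // rather than LN(attention_output + layer_input); the attention
+                                   // output is added back together with the FFN output further down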
gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + layernorm_eps_, + local_batch_size, + hidden_units_, + (float*)nullptr, + 0, + stream_); + } + else { + invokeGeneralAddResidualT5PreLayerNorm( + self_attn_output_, + decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + layernorm_eps_, + local_batch_size, + hidden_units_, + stream_); + } + + TensorMap ffn_input_tensors( + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}}}); + TensorMap ffn_output_tensors({{"ffn_output", + Tensor{MEMORY_GPU, + data_type, + {local_batch_size, hidden_units_}, + use_gptj_residual_ ? ffn_output_ : layer_output}}}); + ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &gpt_decoder_layer_weight->at(l)->ffn_weights); + + if (use_gptj_residual_) { + // Original workflow: + // layer_output = layer_input + reduceSum(ffn_output + self_attn_output + ffn_output_bias) + // Our workflow: + // layer_output = reduceSum(ffn_output + self_attn_output + ffn_output_bias + layer_input / TP_size) + // They are equivalent on math, but we can use same buffer for layer_input and layer_output + invokeAddBiasAttentionFfnResidual(layer_output, + ffn_output_, + self_attn_output_, + layer_input, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + local_batch_size, + hidden_units_, + tensor_para_.world_size_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllReduceSum(layer_output, layer_output, local_batch_size * hidden_units_, tensor_para_, stream_); + } + } + else { + invokeAddBiasResidual(layer_output, + self_attn_output_, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + local_batch_size, + hidden_units_, + stream_); + } + + sync_check_cuda_error(); + + if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 + && pipeline_para_.world_size_ > 1) { + int data_size = local_batch_size * hidden_units_ / tensor_para_.world_size_; + // ftNcclSend(layer_output, local_batch_size * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, + // stream_); + + ftNcclSend(layer_output + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ + 1, + pipeline_para_, + stream_); + } + } + + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } +} + +template class LlamaDecoder; +template class LlamaDecoder; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoder.h b/src/fastertransformer/models/llama/LlamaDecoder.h new file mode 100644 index 000000000..6cdd7df27 --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaDecoder.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include "src/fastertransformer/kernels/add_residual_kernels.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/layers/BaseLayer.h" +#include "src/fastertransformer/layers/FfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" +#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h" +#include "src/fastertransformer/utils/Tensor.h" +#include "src/fastertransformer/utils/allocator.h" +#include "src/fastertransformer/utils/cublasMMWrapper.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace fastertransformer { + +template +class LlamaDecoder: public BaseLayer { +private: +protected: + void allocateBuffer() override; + void allocateBuffer(size_t batch_size); + void freeBuffer() override; + bool isValidLayerParallelId(uint l); + bool isFirstLayerParallelId(uint l); + bool isLastLayerParallelId(uint l); + int getFirstLayerParallelId(); + virtual void initialize(); + + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t rotary_embedding_dim_; + bool neox_rotary_style_; + bool use_gptj_residual_; + size_t hidden_units_; + float layernorm_eps_; + + NcclParam tensor_para_; + NcclParam pipeline_para_; + + std::shared_ptr custom_all_reduce_comm_; + int enable_custom_all_reduce_; + + T* decoder_normed_input_ = nullptr; + T* self_attn_output_ = nullptr; + T* ffn_output_ = nullptr; + T* decoder_layer_output_ = nullptr; + + BaseAttentionLayer* self_attention_layer_; + FfnLayer* ffn_layer_; + +public: + LlamaDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce_ = 0); + + LlamaDecoder(LlamaDecoder const& decoder); + + virtual ~LlamaDecoder(); + + virtual void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* decoder_layer_weights); + + virtual void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* decoder_layer_weights); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc new file mode 100644 index 000000000..ecd539a75 --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h" +#include "src/fastertransformer/utils/memory_utils.h" + +namespace fastertransformer { + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const int hidden_units, + const int inter_size, + const int tensor_para_size, + const int tensor_para_rank, + const bool use_gptj_residual): + hidden_units_(hidden_units), + inter_size_(inter_size), + tensor_para_size_(tensor_para_size), + tensor_para_rank_(tensor_para_rank), + use_gptj_residual_(use_gptj_residual) +{ + mallocWeights(); + setWeightPtr(); +} + +template +LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() +{ + if (is_maintain_buffer == true) { + for (int i = 0; i < 12; i++) { + if (!use_gptj_residual_ && i != attention_dense_bias_weight_id) { + cudaFree(weights_ptr[i]); + } + } + + pre_layernorm_weights.beta = nullptr; + pre_layernorm_weights.gamma = nullptr; + self_attention_weights.query_weight.kernel = nullptr; + self_attention_weights.query_weight.bias = nullptr; + self_attention_weights.attention_output_weight.kernel = nullptr; + self_attention_weights.attention_output_weight.bias = nullptr; + post_attention_layernorm_weights.beta = nullptr; + post_attention_layernorm_weights.gamma = nullptr; + + ffn_weights.intermediate_weight.kernel = nullptr; + ffn_weights.intermediate_weight.bias = nullptr; + ffn_weights.intermediate_weight2.kernel = nullptr; + ffn_weights.intermediate_weight2.bias = nullptr; + ffn_weights.output_weight.kernel = nullptr; + ffn_weights.output_weight.bias = nullptr; + is_maintain_buffer = false; + } +} + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + use_gptj_residual_(other.use_gptj_residual_) +{ + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + if (!use_gptj_residual_) { + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + } + + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); + + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); + + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + setWeightPtr(); +} + +template +LlamaDecoderLayerWeight& LlamaDecoderLayerWeight::operator=(const LlamaDecoderLayerWeight& other) +{ + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + tensor_para_size_ = other.tensor_para_size_; + tensor_para_rank_ = other.tensor_para_rank_; + use_gptj_residual_ = other.use_gptj_residual_; + + mallocWeights(); + + cudaD2Dcpy(weights_ptr[0], 
other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + if (!use_gptj_residual_) { + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + } + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + setWeightPtr(); + return *this; +} + +template +void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) +{ + FT_CHECK(is_maintain_buffer == true); + const std::string rank_spec = std::to_string(tensor_para_rank_); + + // fill all bias to zeros + deviceFill(weights_ptr[0], (size_t)hidden_units_, (T)0.0); + loadWeightFromBin( + weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".input_layernorm.weight.bin", model_file_type); + + loadWeightFromBin(weights_ptr[2], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); + deviceFill(weights_ptr[3], (size_t)(3 * hidden_units_ / tensor_para_size_), (T)0.0); + + loadWeightFromBin(weights_ptr[4], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." + rank_spec + ".bin", + model_file_type); + if (!use_gptj_residual_) { + deviceFill(weights_ptr[5], (size_t)hidden_units_, (T)0.0); + } + + // FIXME(sunpeng17): check if the weights are correct + loadWeightFromBin(weights_ptr[6], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", + model_file_type); + deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + loadWeightFromBin(weights_ptr[8], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + model_file_type); + deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + loadWeightFromBin(weights_ptr[10], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.down_proj.weight." 
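+                      // one weight shard per tensor-parallel rank (rank suffix appended below),
+                      // as produced by the companion huggingface_llama_convert.py script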
+ rank_spec + ".bin", + model_file_type); + deviceFill(weights_ptr[11], (size_t)(hidden_units_), (T)0.0); + + deviceFill(weights_ptr[12], (size_t)(hidden_units_), (T)0.0); + loadWeightFromBin( + weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); +} + +template +void LlamaDecoderLayerWeight::setWeightPtr() +{ + pre_layernorm_weights.beta = weights_ptr[0]; + pre_layernorm_weights.gamma = weights_ptr[1]; + self_attention_weights.query_weight.kernel = weights_ptr[2]; + self_attention_weights.query_weight.bias = weights_ptr[3]; + self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; + self_attention_weights.attention_output_weight.bias = use_gptj_residual_ ? nullptr : weights_ptr[5]; + + ffn_weights.intermediate_weight.kernel = weights_ptr[6]; + ffn_weights.intermediate_weight.bias = weights_ptr[7]; + ffn_weights.intermediate_weight2.kernel = weights_ptr[8]; + ffn_weights.intermediate_weight2.bias = weights_ptr[9]; + ffn_weights.output_weight.kernel = weights_ptr[10]; + ffn_weights.output_weight.bias = weights_ptr[11]; + + post_attention_layernorm_weights.beta = weights_ptr[12]; + post_attention_layernorm_weights.gamma = weights_ptr[13]; + is_maintain_buffer = true; +} + +template +void LlamaDecoderLayerWeight::mallocWeights() +{ + deviceMalloc(&weights_ptr[0], hidden_units_); // pre layernorm beta + deviceMalloc(&weights_ptr[1], hidden_units_); // pre layernorm gamma + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv kernel + deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); // qkv bias + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + if (!use_gptj_residual_) { + deviceMalloc(&weights_ptr[5], hidden_units_); // attention output bias + } + + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel + deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); // intermediate_weight bias + deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel + deviceMalloc(&weights_ptr[9], inter_size_ / tensor_para_size_); // intermediate_weight2 bias + deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel + deviceMalloc(&weights_ptr[11], hidden_units_); // output_weight bias + deviceMalloc(&weights_ptr[12], hidden_units_); // post attn layernorm beta + deviceMalloc(&weights_ptr[13], hidden_units_); // post attn layernorm gamma +} + +template struct LlamaDecoderLayerWeight; +template struct LlamaDecoderLayerWeight; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h new file mode 100644 index 000000000..008e1a3b4 --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/layers/FfnWeight.h" +#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" +#include "src/fastertransformer/utils/cuda_utils.h" + +namespace fastertransformer { + +template +struct LlamaDecoderLayerWeight { +public: + LlamaDecoderLayerWeight() = default; + LlamaDecoderLayerWeight(const int hidden_units, + const int inter_size, + const int tensor_para_size = 1, + const int tensor_para_rank = 0, + const bool use_gptj_residual = true); + ~LlamaDecoderLayerWeight(); + LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other); + LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other); + + void loadModel(std::string dir_path, FtCudaDataType model_file_type); + + LayerNormWeight pre_layernorm_weights; + AttentionWeight self_attention_weights; + LayerNormWeight post_attention_layernorm_weights; + FfnWeight ffn_weights; + +private: + int hidden_units_; + int inter_size_; + int tensor_para_size_; + int tensor_para_rank_; + bool use_gptj_residual_; + const int attention_dense_bias_weight_id = 5; + bool is_maintain_buffer = false; + T* weights_ptr[14]; + + void setWeightPtr(); + void mallocWeights(); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc new file mode 100644 index 000000000..65c2f762d --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/LlamaWeight.h" + +namespace fastertransformer { + +template +LlamaWeight::LlamaWeight(const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int max_seq_len, + const int tensor_para_size, + const int tensor_para_rank, + const int layer_para_size, + const int layer_para_rank, + const bool use_gptj_residual, + PromptLearningType prompt_learning_type, + std::map> prompt_learning_pair): + hidden_units_(hidden_units), + inter_size_(inter_size), + vocab_size_(vocab_size), + num_layer_(num_layer), + max_seq_len_(max_seq_len), + tensor_para_size_(tensor_para_size), + tensor_para_rank_(tensor_para_rank), + layer_para_size_(layer_para_size), + layer_para_rank_(layer_para_rank), + use_gptj_residual_(use_gptj_residual), + prompt_learning_type_(prompt_learning_type), + prompt_learning_pair_(prompt_learning_pair) +{ + FT_CHECK(num_layer_ % layer_para_size_ == 0); + // set prompt weight size + if (prompt_learning_type_ == PromptLearningType::prefix_prompt) { + prompt_token_weight_size_ = 2 * num_layer_ * hidden_units_ / tensor_para_size_; + } + else if (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) { + prompt_token_weight_size_ = hidden_units_; + } + + // set if load and malloc prompt weights + malloc_load_prompt_weights_ = !prompt_learning_pair_.empty() + && (prompt_learning_type_ == PromptLearningType::p_prompt_tuning + || prompt_learning_type_ == PromptLearningType::prefix_prompt); + + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l)) { + decoder_layer_weights.push_back(new LlamaDecoderLayerWeight( + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + } + else { + // Layer-parallelism: allocate empty layer because + // this rank does not compute it: + decoder_layer_weights.push_back(new LlamaDecoderLayerWeight(0, 0)); + } + } + + mallocWeights(); + setWeightPtr(); +} + +template +LlamaWeight::~LlamaWeight() +{ + if (is_maintain_buffer == true) { + for (int i = 0; i < weights_ptr.size(); i++) { + deviceFree(weights_ptr[i]); + } + + pre_decoder_embedding_table = nullptr; + post_decoder_layernorm.beta = nullptr; + post_decoder_layernorm.gamma = nullptr; + post_decoder_embedding.kernel = nullptr; + is_maintain_buffer = false; + } +} + +template +LlamaWeight::LlamaWeight(const LlamaWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + vocab_size_(other.vocab_size_), + num_layer_(other.num_layer_), + max_seq_len_(other.max_seq_len_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + layer_para_size_(other.layer_para_size_), + layer_para_rank_(other.layer_para_rank_), + use_gptj_residual_(other.use_gptj_residual_), + prompt_token_weight_size_(other.prompt_token_weight_size_), + malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), + prompt_learning_type_(other.prompt_learning_type_), + prompt_learning_pair_(other.prompt_learning_pair_) +{ + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning table: malloc weights and set weight ptr + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + 
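+            // prompt_learning_pair_ maps task_name -> (task_name_id, prompt_length);
+            // each task's prompt table is stored at weights_ptr[num_base_weights + task_name_id]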
std::string task_name = prompt.first; + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t prompt_id = num_base_weights + (size_t)task_name_id; + + // cuda device to device memcpy prompt table weights buffer memory + cudaD2Dcpy(weights_ptr[prompt_id], other.weights_ptr[prompt_id], prompt_length * prompt_token_weight_size_); + } + } + + setWeightPtr(); + + decoder_layer_weights.clear(); + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(other.decoder_layer_weights[l]); + } +} + +template +LlamaWeight& LlamaWeight::operator=(const LlamaWeight& other) +{ + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + vocab_size_ = other.vocab_size_; + num_layer_ = other.num_layer_; + max_seq_len_ = other.max_seq_len_; + tensor_para_size_ = other.tensor_para_size_; + tensor_para_rank_ = other.tensor_para_rank_; + layer_para_size_ = other.layer_para_size_; + layer_para_rank_ = other.layer_para_rank_; + use_gptj_residual_ = other.use_gptj_residual_; + prompt_token_weight_size_ = other.prompt_token_weight_size_; + malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; + prompt_learning_type_ = other.prompt_learning_type_; + prompt_learning_pair_ = other.prompt_learning_pair_; + + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning table: malloc weights and set weight ptr + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + std::string task_name = prompt.first; + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t prompt_id = num_base_weights + (size_t)task_name_id; + + // cuda device to device memcpy prompt table weights buffer memory + cudaD2Dcpy(weights_ptr[prompt_id], other.weights_ptr[prompt_id], prompt_length * prompt_token_weight_size_); + } + } + + setWeightPtr(); + + decoder_layer_weights.clear(); + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(other.decoder_layer_weights[l]); + } + return *this; +} + +template +void LlamaWeight::setWeightPtr() +{ + prompt_learning_table.resize(prompt_learning_pair_.size()); + + pre_decoder_embedding_table = weights_ptr[0]; + post_decoder_layernorm.beta = weights_ptr[1]; + post_decoder_layernorm.gamma = weights_ptr[2]; + post_decoder_embedding.kernel = weights_ptr[3]; + + // prompt learning tables: set weight ptr + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t task_weight_id = num_base_weights + (size_t)task_name_id; + + // set weight ptr + prompt_learning_table[task_name_id] = {weights_ptr[task_weight_id], prompt_length}; + } + } +} + +template +void LlamaWeight::mallocWeights() +{ + weights_ptr.resize(num_base_weights + prompt_learning_pair_.size()); + + deviceMalloc(&weights_ptr[0], vocab_size_ * hidden_units_); + deviceMalloc(&weights_ptr[1], hidden_units_); + deviceMalloc(&weights_ptr[2], hidden_units_); + deviceMalloc(&weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning tables: malloc weights + if (malloc_load_prompt_weights_) { + for (auto 
const& prompt : prompt_learning_pair_) { + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t task_weight_id = num_base_weights + (size_t)task_name_id; + + // malloc weights + T* prompt_weights_ptr = nullptr; + deviceMalloc(&prompt_weights_ptr, prompt_length * prompt_token_weight_size_); + weights_ptr[task_weight_id] = prompt_weights_ptr; + } + } + is_maintain_buffer = true; +} + +template +void LlamaWeight::loadModel(std::string dir_path) +{ + FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "llama"); + FT_CHECK(is_maintain_buffer == true); + + loadWeightFromBin( + weights_ptr[0], {(size_t)(vocab_size_ * hidden_units_)}, dir_path + "/model.wte.weight.bin", model_file_type); + deviceFill(weights_ptr[1], (size_t)hidden_units_, (T)0.0); + loadWeightFromBin( + weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.weight.bin", model_file_type); + loadWeightFromBin(weights_ptr[3], + {(size_t)(vocab_size_ * hidden_units_)}, + dir_path + "/model.lm_head.weight.bin", + model_file_type); + + // prompt table: load weights from bin + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + std::string task_name = prompt.first; + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t task_weight_id = num_base_weights + (size_t)task_name_id; + + std::string prompt_weight_path_name = (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) ? + (dir_path + "/model.prompt_table." + task_name + ".weight.bin") : + (dir_path + "/model.prefix_prompt." + task_name + ".weight." + + std::to_string(tensor_para_rank_) + ".bin"); + + if (prompt_length > 0) { + loadWeightFromBin(weights_ptr[task_weight_id], + {(size_t)(prompt_length * (int)prompt_token_weight_size_)}, + prompt_weight_path_name, + model_file_type); + } + } + } + + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l)) { + decoder_layer_weights[l]->loadModel(dir_path + "/model.layers." + std::to_string(l), model_file_type); + } + } +} + +template +void LlamaWeight::resizeLayer(const int num_layer) +{ + num_layer_ = num_layer; + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(new LlamaDecoderLayerWeight()); + } +} + +template +bool LlamaWeight::isValidLayerParallelId(int l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / layer_para_size_)); + return l < num_layer_ && (l >= local_num_layer * layer_para_rank_) + && (l < local_num_layer * (layer_para_rank_ + 1)); +} + +template struct LlamaWeight; +template struct LlamaWeight; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.h b/src/fastertransformer/models/llama/LlamaWeight.h new file mode 100644 index 000000000..2f5b2632a --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaWeight.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h" +#include "src/fastertransformer/utils/memory_utils.h" +#include "src/fastertransformer/utils/prompt_learning.h" + +namespace fastertransformer { + +template +struct LlamaWeight { + + LlamaWeight() = default; + LlamaWeight( + const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int max_seq_len, + const int tensor_para_size = 1, + const int tensor_para_rank = 0, + const int layer_para_size = 1, + const int layer_para_rank = 0, + const bool use_gptj_residual_ = true, + PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, + std::map> prompt_learning_pair = std::map>{}); + + ~LlamaWeight(); + LlamaWeight(const LlamaWeight& other); + LlamaWeight& operator=(const LlamaWeight& other); + + void loadModel(std::string dir_path); + + void resizeLayer(const int num_layer); + + std::vector*> decoder_layer_weights; + const T* pre_decoder_embedding_table = nullptr; + // GPT-J does not use embedding table, but we leave the ptr such that + // GptNeoX::forward and Gpt::forward become identical + const T* position_encoding_table = nullptr; + + /* + prompt_learning_pair = vectors of [weight ptr, prompt length] pair + prompt_length is stored here for compatible prompt learning table + prefix_prompt weights store as shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] + p/prompt tuning weights store as shape [prompt_len, hidden_units] + idx is the task_name_id of the prompt tables + */ + std::vector> prompt_learning_table = {}; + + LayerNormWeight post_decoder_layernorm; + DenseWeight post_decoder_embedding; + + inline void setMaxSeqLen(size_t max_seq_len) + { + max_seq_len_ = max_seq_len; + } + +private: + void setWeightPtr(); + void mallocWeights(); + bool isValidLayerParallelId(int l); + + int hidden_units_; + int inter_size_; + int vocab_size_; + int num_layer_; + int max_seq_len_; + + int tensor_para_size_; + int tensor_para_rank_; + int layer_para_size_; + int layer_para_rank_; + + // residual type + bool use_gptj_residual_; + + // prompt learning pair (task_name, (task_name_id, prompt_len)) + PromptLearningType prompt_learning_type_; + std::map> prompt_learning_pair_; + bool malloc_load_prompt_weights_ = false; + // each prompt token's weight size + size_t prompt_token_weight_size_ = 0; + + bool is_maintain_buffer = false; + const size_t num_base_weights = 4; + std::vector weights_ptr = std::vector(num_base_weights); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/triton_backend/CMakeLists.txt b/src/fastertransformer/triton_backend/CMakeLists.txt index 0079e087a..9620cc1a3 100644 --- a/src/fastertransformer/triton_backend/CMakeLists.txt +++ b/src/fastertransformer/triton_backend/CMakeLists.txt @@ -26,3 +26,5 @@ if (ENABLE_FP8) add_subdirectory(multi_gpu_gpt_fp8) endif() add_subdirectory(bert) + +# add_subdirectory(llama) From a32fc1def50b2d07f8f131e306bdfb1c005ce0b8 Mon Sep 17 00:00:00 2001 From: void-main Date: Mon, 24 Apr 2023 07:32:13 +0000 Subject: [PATCH 02/21] make the code work :yay: --- examples/cpp/llama/check_with_huggingface.py | 15 +++++++------ .../cpp/llama/huggingface_llama_convert.py | 22 +++++++++---------- examples/cpp/llama/llama_example.cc | 2 +- .../models/llama/CMakeLists.txt | 4 ++-- 
src/fastertransformer/models/llama/Llama.h | 4 ++-- .../models/llama/LlamaContextDecoder.cc | 7 +++--- .../models/llama/LlamaDecoder.cc | 7 +++--- .../models/llama/LlamaDecoderLayerWeight.cc | 4 ++-- .../models/llama/LlamaWeight.h | 2 +- 9 files changed, 33 insertions(+), 34 deletions(-) diff --git a/examples/cpp/llama/check_with_huggingface.py b/examples/cpp/llama/check_with_huggingface.py index 0ba69036e..d1f356cc1 100644 --- a/examples/cpp/llama/check_with_huggingface.py +++ b/examples/cpp/llama/check_with_huggingface.py @@ -1,15 +1,16 @@ import transformers -import torch from transformers import LlamaForCausalLM, LlamaTokenizer tokenizer = LlamaTokenizer.from_pretrained('/data/llama-7b-hf') -prompt = "Hey" -inputs = tokenizer(prompt, return_tensors='pt') -print(inputs) +prompt = "Hey, are you consciours? Can you talk to me?" +inputs = tokenizer(prompt, return_tensors='pt') model = LlamaForCausalLM.from_pretrained("/data/llama-7b-hf") -generated_ids = model.generate(inputs.input_ids, max_length=10) +hf_config = vars(model.config) +print(hf_config) +generated_ids = model.forward(inputs.input_ids, output_hidden_states=True) print(generated_ids) -output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] -print(output) + +tokens = [0,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366] +print(tokenizer.decode(tokens)) diff --git a/examples/cpp/llama/huggingface_llama_convert.py b/examples/cpp/llama/huggingface_llama_convert.py index 28a552a28..37868d0bd 100644 --- a/examples/cpp/llama/huggingface_llama_convert.py +++ b/examples/cpp/llama/huggingface_llama_convert.py @@ -119,31 +119,31 @@ def split_and_convert(args): print(f"converting layer {l}") # first merge QKV into a single weight # concat direct to FT shape: [hidden_size, 3, head_num, head_size] - qkv_weights = np.empty((hidden_size, 3, head_num, head_size), dtype=np_weight_data_type) - q_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight']) - k_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.k_proj.weight']) - v_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.v_proj.weight']) - qkv_weights[:, 0, :, :] = q_weight.reshape(hidden_size, head_num, head_size) - qkv_weights[:, 1, :, :] = k_weight.reshape(hidden_size, head_num, head_size) - qkv_weights[:, 2, :, :] = v_weight.reshape(hidden_size, head_num, head_size) + # copied from huggingface_gptj_ckpt_convert.py + qkv_weights = np.stack([ + param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight']), + param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.k_proj.weight']), + param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.v_proj.weight']), + ]) + qkv_weights = np.transpose(qkv_weights, (2, 0, 1)) qkv_weights_base_name = f'model.layers.{l}.attention.query_key_value.weight' split_and_convert_process(saved_dir, factor, qkv_weights_base_name, qkv_weights) # attention dense - o_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.o_proj.weight']) + o_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.o_proj.weight']).T o_weight_base_name = f'model.layers.{l}.attention.dense.weight' split_and_convert_process(saved_dir, factor, o_weight_base_name, o_weight) # MLP - mlp_down_weight = 
param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.down_proj.weight']) + mlp_down_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.down_proj.weight']).T mlp_down_base_name = f'model.layers.{l}.mlp.down_proj.weight' split_and_convert_process(saved_dir, factor, mlp_down_base_name, mlp_down_weight) - mlp_gate_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.gate_proj.weight']) + mlp_gate_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.gate_proj.weight']).T mlp_gate_base_name = f'model.layers.{l}.mlp.gate_proj.weight' split_and_convert_process(saved_dir, factor, mlp_gate_base_name, mlp_gate_weight) - mlp_up_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.up_proj.weight']) + mlp_up_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.up_proj.weight']).T mlp_up_base_name = f'model.layers.{l}.mlp.up_proj.weight' split_and_convert_process(saved_dir, factor, mlp_up_base_name, mlp_up_weight) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index c72a7a8b7..5672fc2cd 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -263,7 +263,7 @@ void llama_example(const INIReader reader) cublas_wrapper.setFP32GemmConfig(); } - const bool use_gptj_residual = (bool)reader.GetInteger(model_name, "use_gptj_residual", 1); + const bool use_gptj_residual = false; fastertransformer::LlamaWeight gpt_weights(hidden_units, inter_size, vocab_size, diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt index ec836068d..88ad2ad42 100644 --- a/src/fastertransformer/models/llama/CMakeLists.txt +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -24,7 +24,7 @@ set_property(TARGET LlamaDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET LlamaDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LlamaDecoder PUBLIC -lcudart cublasMMWrapper TensorParallelDecoderSelfAttentionLayer - TensorParallelGeluFfnLayer + TensorParallelSiluFfnLayer layernorm_kernels add_residual_kernels LlamaDecoderLayerWeight @@ -38,7 +38,7 @@ set_property(TARGET LlamaContextDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET LlamaContextDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LlamaContextDecoder PUBLIC -lcudart cublasMMWrapper TensorParallelGptContextAttentionLayer - TensorParallelGeluFfnLayer + TensorParallelSiluFfnLayer layernorm_kernels add_residual_kernels gpt_kernels diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h index cd79ac0b8..8ba6a88df 100644 --- a/src/fastertransformer/models/llama/Llama.h +++ b/src/fastertransformer/models/llama/Llama.h @@ -40,7 +40,7 @@ class Llama: public BaseLayer { size_t rotary_embedding_dim_; static constexpr bool neox_rotary_style_ = true; - static constexpr float layernorm_eps_ = 1e-5f; + static constexpr float layernorm_eps_ = 1e-6f; int start_id_; int end_id_; @@ -61,7 +61,7 @@ class Llama: public BaseLayer { std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); // Residual Type - const bool use_gptj_residual_ = true; + const bool use_gptj_residual_ = false; // Prompt Learning Parameters PromptLearningType prompt_learning_type_; diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index f107d38c1..8ea1494fb 100644 --- 
a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -18,7 +18,7 @@ #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" -#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" +#include "src/fastertransformer/layers/TensorParallelSiluFfnLayer.h" #include "src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.h" namespace fastertransformer { @@ -44,7 +44,7 @@ void LlamaContextDecoder::initialize() custom_all_reduce_comm_, enable_custom_all_reduce_); - ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size + ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 0, // max_seq_len head_num_, size_per_head_, @@ -57,7 +57,6 @@ void LlamaContextDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, true, // use_gated_activation = true; custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -333,7 +332,7 @@ void LlamaContextDecoder::forward(std::unordered_map* invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + (const T*)nullptr, layernorm_eps_, h_token_num, hidden_units_, diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index d5fb58fee..23e9b0eec 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -15,7 +15,7 @@ */ #include "src/fastertransformer/models/llama/LlamaDecoder.h" -#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" +#include "src/fastertransformer/layers/TensorParallelSiluFfnLayer.h" #include "src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.h" namespace fastertransformer { @@ -39,7 +39,7 @@ void LlamaDecoder::initialize() custom_all_reduce_comm_, enable_custom_all_reduce_); - ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size + ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 1, head_num_, size_per_head_, @@ -52,7 +52,6 @@ void LlamaDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, true, // use_gated_activation = true; custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -264,7 +263,7 @@ void LlamaDecoder::forward(std::unordered_map* invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + (const T*)nullptr, layernorm_eps_, local_batch_size, hidden_units_, diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index ecd539a75..39a1d5c1f 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -154,13 +154,13 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType // FIXME(sunpeng17): check if the weights are correct loadWeightFromBin(weights_ptr[6], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, - dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", + dir_path + ".mlp.gate_proj.weight." 
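Switching both decoders from TensorParallelGeluFfnLayer to TensorParallelSiluFfnLayer with use_gated_activation enabled selects LLaMA's gated MLP. As a rough functional sketch (plain numpy, not the FT kernels, with weights already in the converter's [in, out] layout and no biases):

    import numpy as np

    def silu(x):
        return x / (1.0 + np.exp(-x))   # x * sigmoid(x)

    def llama_mlp(x, w_gate, w_up, w_down):
        # x: [tokens, hidden], w_gate / w_up: [hidden, inter], w_down: [inter, hidden]
        return (silu(x @ w_gate) * (x @ w_up)) @ w_down

This is also why the gate_proj / up_proj file names in LlamaDecoderLayerWeight::loadModel below matter: the gated (activated) branch and the plain linear branch must not be swapped.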
+ rank_spec + ".bin", model_file_type); deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); loadWeightFromBin(weights_ptr[8], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, - dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", model_file_type); deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); diff --git a/src/fastertransformer/models/llama/LlamaWeight.h b/src/fastertransformer/models/llama/LlamaWeight.h index 2f5b2632a..ec909ca49 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.h +++ b/src/fastertransformer/models/llama/LlamaWeight.h @@ -37,7 +37,7 @@ struct LlamaWeight { const int tensor_para_rank = 0, const int layer_para_size = 1, const int layer_para_rank = 0, - const bool use_gptj_residual_ = true, + const bool use_gptj_residual_ = false, PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, std::map> prompt_learning_pair = std::map>{}); From ce8700fa866b3b2616d8788e6005be731afc9377 Mon Sep 17 00:00:00 2001 From: void-main Date: Mon, 24 Apr 2023 09:30:41 +0000 Subject: [PATCH 03/21] fix llama rms ln --- src/fastertransformer/models/llama/Llama.cc | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 0091fce48..d0305e84a 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -843,16 +843,14 @@ void Llama::forward(std::unordered_map* output_ten } if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { - invokeGeneralLayerNorm(normed_decoder_output_buf_ + hidden_units_offset, - decoder_output_buf_ + hidden_units_offset, - gpt_weights->post_decoder_layernorm.gamma, - gpt_weights->post_decoder_layernorm.beta, - layernorm_eps_, - local_batch_size * beam_width, - hidden_units_, - (float*)nullptr, - 0, - stream_); + invokeGeneralT5LayerNorm(normed_decoder_output_buf_ + hidden_units_offset, + decoder_output_buf_ + hidden_units_offset, + gpt_weights->post_decoder_layernorm.gamma, + (const T*)nullptr, + layernorm_eps_, + local_batch_size * beam_width, + hidden_units_, + stream_); sync_check_cuda_error(); if (tensor_para_.world_size_ == 1) { From 91989cb6fdf84e048247ffa4396f7ac3f574254d Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 25 Apr 2023 03:08:37 +0000 Subject: [PATCH 04/21] add bf16 support --- examples/cpp/llama/llama_example.cc | 5 +++++ src/fastertransformer/models/llama/Llama.cc | 4 ++++ src/fastertransformer/models/llama/LlamaContextDecoder.cc | 4 ++++ src/fastertransformer/models/llama/LlamaDecoder.cc | 4 ++++ .../models/llama/LlamaDecoderLayerWeight.cc | 4 ++++ src/fastertransformer/models/llama/LlamaWeight.cc | 4 ++++ 6 files changed, 25 insertions(+) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 5672fc2cd..0f8e4a5aa 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -60,6 +60,11 @@ int main(int argc, char* argv[]) else if (data_type == "fp16") { llama_example(reader); } +#ifdef ENABLE_BF16 + else if (data_type == "bf16") { + llama_example<__nv_bfloat16>(reader); + } +#endif else { FT_LOG_ERROR("is_fp16 should be 0 (use float) or 1 (use half)."); return -1; diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index d0305e84a..0f547d2c5 100644 --- 
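Replacing invokeGeneralLayerNorm with invokeGeneralT5LayerNorm for the final norm, together with the null beta passed to the per-layer pre-norms above, makes every normalization the bias-free RMS variant LLaMA expects, with layernorm_eps_ = 1e-6f. A reference sketch, assuming the usual T5-style semantics of that kernel (no mean subtraction, no bias):

    import numpy as np

    def rms_norm(x, gamma, eps=1e-6):
        # x: [tokens, hidden], gamma: [hidden]
        variance = np.mean(x.astype(np.float32) ** 2, axis=-1, keepdims=True)
        return x / np.sqrt(variance + eps) * gamma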
a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -1206,4 +1206,8 @@ bool* Llama::getFinishBuffer() template class Llama; template class Llama; +#ifdef ENABLE_BF16 +template class Llama<__nv_bfloat16>; +#endif + } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index 8ea1494fb..900e1f016 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -500,4 +500,8 @@ void LlamaContextDecoder::forward(std::unordered_map* template class LlamaContextDecoder; template class LlamaContextDecoder; +#ifdef ENABLE_BF16 +template class LlamaContextDecoder<__nv_bfloat16>; +#endif + } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index 23e9b0eec..dd9a51cee 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -377,4 +377,8 @@ void LlamaDecoder::forward(std::unordered_map* template class LlamaDecoder; template class LlamaDecoder; +#ifdef ENABLE_BF16 +template class LlamaDecoder<__nv_bfloat16>; +#endif + } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index 39a1d5c1f..dc99451b9 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -222,4 +222,8 @@ void LlamaDecoderLayerWeight::mallocWeights() template struct LlamaDecoderLayerWeight; template struct LlamaDecoderLayerWeight; +#ifdef ENABLE_BF16 +template class LlamaDecoderLayerWeight<__nv_bfloat16>; +#endif + } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc index 65c2f762d..01741e4f2 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.cc +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -298,4 +298,8 @@ bool LlamaWeight::isValidLayerParallelId(int l) template struct LlamaWeight; template struct LlamaWeight; +#ifdef ENABLE_BF16 +template class LlamaWeight<__nv_bfloat16>; +#endif + } // namespace fastertransformer From 4bc97c333cb60671547de71de30c5aad0c76eae1 Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 25 Apr 2023 04:56:41 +0000 Subject: [PATCH 05/21] add triton model for streaming callback --- examples/cpp/llama/CMakeLists.txt | 8 +- examples/cpp/llama/llama_config.ini | 2 +- examples/cpp/llama/llama_example.cc | 1 - examples/cpp/llama/llama_triton_example.cc | 73 +++-- .../triton_backend/CMakeLists.txt | 2 +- .../triton_backend/llama/CMakeLists.txt | 25 ++ .../triton_backend/llama/LlamaTritonModel.cc | 253 +++++++++++++++++ .../triton_backend/llama/LlamaTritonModel.h | 82 ++++++ .../llama/LlamaTritonModelInstance.cc | 265 ++++++++++++++++++ .../llama/LlamaTritonModelInstance.h | 82 ++++++ .../transformer_triton_backend.hpp | 1 + 11 files changed, 746 insertions(+), 48 deletions(-) create mode 100644 src/fastertransformer/triton_backend/llama/CMakeLists.txt create mode 100644 src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc create mode 100644 src/fastertransformer/triton_backend/llama/LlamaTritonModel.h create mode 100644 src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc create mode 
100644 src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt index 0495d3bf2..cdf9033dd 100644 --- a/examples/cpp/llama/CMakeLists.txt +++ b/examples/cpp/llama/CMakeLists.txt @@ -16,7 +16,7 @@ add_executable(llama_example llama_example.cc) target_link_libraries(llama_example PUBLIC -lcublas -lcublasLt -lcudart Llama nvtx_utils gpt_example_utils word_list mpi_utils nccl_utils) -# add_executable(llama_triton_example llama_triton_example.cc) -# target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart -lpthread -# LlamaTritonBackend TransformerTritonBackend custom_ar_comm -# gpt_example_utils word_list mpi_utils nccl_utils nvtx_utils) +add_executable(llama_triton_example llama_triton_example.cc) +target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart -lpthread + LlamaTritonBackend TransformerTritonBackend custom_ar_comm + gpt_example_utils word_list mpi_utils nccl_utils nvtx_utils) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 5882e03a4..6fa7117ca 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -6,7 +6,7 @@ tensor_para_size=1 pipeline_para_size=1 model_name=llama_7b -model_dir=/data/llama-7b-hf-converted +model_dir=/data/llama-7b-hf-converted/1-gpu [request] beam_width=1 # beam width for beam search diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 0f8e4a5aa..5e84e13d9 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -282,7 +282,6 @@ void llama_example(const INIReader reader) prompt_learning_type, prefix_prompt_table_pair); - model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; gpt_weights.loadModel(model_dir); unsigned long long random_seed; if (rank == 0) { diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 1840035a2..e639ae9ca 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -118,7 +118,7 @@ broadCastRequest(const std::vector& v_start_ids, pointer_record->push_back(start_ids_ptr); pointer_record->push_back(end_ids_ptr); - std::shared_ptr> input_tensors( + request_list.push_back(std::shared_ptr>( new std::unordered_map{ {"input_ids", triton::Tensor{triton::MEMORY_GPU, @@ -130,25 +130,18 @@ broadCastRequest(const std::vector& v_start_ids, triton::TYPE_INT32, std::vector{(size_t)request_batch_size}, d_input_lengths}}, - // NOTE: add prefix prompt task ids here if you need - // {"prefix_prompt_task_ids", triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, - // std::vector{request_batch_size}, task_name_ids}}, {"request_output_len", triton::Tensor{triton::MEMORY_CPU, - triton::TYPE_UINT32, + triton::TYPE_INT32, std::vector{(size_t)request_batch_size}, request_output_len_ptr}}, + {"bad_words_list", + triton::Tensor{ + triton::MEMORY_GPU, triton::TYPE_INT32, {2, v_input_bad_words.size() / 2}, d_input_bad_words}}, {"start_id", triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, start_ids_ptr}}, {"end_id", - triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}}}); - if (!v_input_bad_words.empty()) { - input_tensors->insert( - {"bad_words_list", - triton::Tensor{ - triton::MEMORY_GPU, triton::TYPE_INT32, {2, v_input_bad_words.size() / 2}, d_input_bad_words}}); - 
} - request_list.push_back(input_tensors); + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}}})); int* beam_width_ptr = new int(param.beam_width); pointer_record->push_back(beam_width_ptr); @@ -234,10 +227,10 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std ft::FT_CHECK(false); } - const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); - const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); - const int start_id = reader.GetInteger(model_name, "start_id"); - const int end_id = reader.GetInteger(model_name, "end_id"); + const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); + + const int start_id = reader.GetInteger("llama_7b", "start_id"); + const int end_id = reader.GetInteger("llama_7b", "end_id"); std::vector v_start_ids; std::vector v_start_lengths; @@ -249,22 +242,22 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std max_input_len, end_id, 1, - "../examples/cpp/gptj/start_ids.csv"); + "../examples/cpp/llama/start_ids.csv"); std::vector v_bad_words; - ft::read_word_list("../examples/cpp/gptj/bad_words.csv", v_bad_words); + ft::read_word_list("../examples/cpp/llama/bad_words.csv", v_bad_words); RequestParam param; - param.beam_width = reader.GetInteger("ft_instance_hyperparameter", "beam_width"); + param.beam_width = reader.GetInteger("request", "beam_width"); param.request_output_len = reader.GetInteger("request", "request_output_len"); - param.beam_search_diversity_rate = reader.GetFloat("ft_instance_hyperparameter", "beam_search_diversity_rate"); - param.runtime_top_k = (uint)reader.GetInteger("ft_instance_hyperparameter", "top_k"); - param.runtime_top_p = reader.GetFloat("ft_instance_hyperparameter", "top_p"); - param.temperature = reader.GetFloat("ft_instance_hyperparameter", "temperature"); - param.len_penalty = reader.GetFloat("ft_instance_hyperparameter", "len_penalty"); - param.repetition_penalty = reader.GetFloat("ft_instance_hyperparameter", "repetition_penalty", 1.0f); - param.presence_penalty = reader.GetFloat("ft_instance_hyperparameter", "presence_penalty", 0.0f); - param.min_length = reader.GetInteger("ft_instance_hyperparameter", "min_length", 0); + param.beam_search_diversity_rate = reader.GetFloat("request", "beam_search_diversity_rate"); + param.runtime_top_k = reader.GetInteger("request", "top_k"); + param.runtime_top_p = reader.GetFloat("request", "top_p"); + param.temperature = reader.GetFloat("request", "temperature"); + param.len_penalty = reader.GetFloat("request", "len_penalty"); + param.repetition_penalty = reader.GetFloat("request", "repetition_penalty", 1.0f); + param.presence_penalty = reader.GetFloat("request", "presence_penalty", 0.0f); + param.min_length = reader.GetInteger("request", "min_length", 0); param.random_seed = (unsigned long long int)0; param.start_id = start_id; param.end_id = end_id; @@ -310,18 +303,20 @@ int main(int argc, char* argv[]) by MPI or triton */ + MPICHECK(MPI_Init(&argc, &argv)); ft::mpi::initialize(&argc, &argv); int node_id = ft::mpi::getCommWorldRank(); int node_num = ft::mpi::getCommWorldSize(); + std::cout << "node_id: " << node_id << ", node_num: " << node_num << std::endl; // Note: Only supports that all nodes have same gpu count const int gpu_count = ft::getDeviceCount(); const int world_size = node_num * gpu_count; - std::string ini_name = argc >= 2 ? 
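For anyone adapting this client, the per-request tensor map assembled in broadCastRequest boils down to the entries below (a sketch mirroring the code above; sizes are illustrative, and the beam width plus the sampling controls read from the [request] section are appended to the same map afterwards):

    # placement, dtype, shape of the core request tensors (illustrative sizes)
    batch_size, max_input_len, n_bad_words = 8, 32, 2

    request_tensors = {
        "input_ids":          ("GPU", "int32", (batch_size, max_input_len)),
        "input_lengths":      ("GPU", "int32", (batch_size,)),
        "request_output_len": ("CPU", "int32", (batch_size,)),
        "bad_words_list":     ("GPU", "int32", (2, n_bad_words)),
        "start_id":           ("CPU", "int32", (batch_size,)),
        "end_id":             ("CPU", "int32", (batch_size,)),
    }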
std::string(argv[1]) : "../examples/cpp/gptj/gptj_config.ini"; + std::string ini_name = argc >= 2 ? std::string(argv[1]) : "../examples/cpp/llama/llama_config.ini"; // step 1: Create model - std::shared_ptr model = AbstractTransformerModel::createGptJModel(ini_name); - int tensor_para_size = model->getTensorParaSize(); + std::shared_ptr model = AbstractTransformerModel::createLlamaModel(ini_name); + int tensor_para_size = model->getTensorParaSize(); int pipeline_para_size = model->getPipelineParaSize(); FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size), "World Size != Tensor Parallel Size * Pipeline Parallel Size !"); @@ -329,7 +324,7 @@ int main(int argc, char* argv[]) std::cout << model->toString(); // step 2: Initialize the NCCL - std::pair, std::vector> nccl_params = model->createNcclParams(node_id); + std::pair, std::vector> nccl_comms = model->createNcclParams(node_id); cudaDeviceSynchronize(); // Optional Step: create custom all reduce comm @@ -346,7 +341,7 @@ int main(int argc, char* argv[]) &model_instances, device_id, rank, - nccl_params, + nccl_comms, custom_all_reduce_comms[rank])); } for (auto& t : threads) { @@ -398,20 +393,16 @@ int main(int argc, char* argv[]) std::cout << "Writing " << outCount << " elements\n"; int zeroCount = 0; for (size_t i = 0; i < outCount; i++) { - if (hBuf[i] == int(0)) { + if (hBuf[i] == int(0)) zeroCount++; - } outFile << hBuf[i] << " "; - if ((i + 1) % (seq_len) == 0) { + if ((i + 1) % (seq_len) == 0) outFile << std::endl; - } - if (i < 10) { + if (i < 10) printf("%5d ", hBuf[i]); - } - if ((i + 1) % (seq_len) == 0 && i < 10) { + if ((i + 1) % (seq_len) == 0 && i < 10) std::cout << std::endl; - } } std::cout << std::endl << "zeroCount = " << zeroCount << std::endl; } diff --git a/src/fastertransformer/triton_backend/CMakeLists.txt b/src/fastertransformer/triton_backend/CMakeLists.txt index 9620cc1a3..63f3526da 100644 --- a/src/fastertransformer/triton_backend/CMakeLists.txt +++ b/src/fastertransformer/triton_backend/CMakeLists.txt @@ -27,4 +27,4 @@ if (ENABLE_FP8) endif() add_subdirectory(bert) -# add_subdirectory(llama) +add_subdirectory(llama) diff --git a/src/fastertransformer/triton_backend/llama/CMakeLists.txt b/src/fastertransformer/triton_backend/llama/CMakeLists.txt new file mode 100644 index 000000000..d5ba5547e --- /dev/null +++ b/src/fastertransformer/triton_backend/llama/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cmake_minimum_required(VERSION 3.8) + +set(parallel_gpt_triton_backend_files + LlamaTritonModel.cc + LlamaTritonModelInstance.cc +) + +add_library(LlamaTritonBackend STATIC ${parallel_gpt_triton_backend_files}) +set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON) +target_link_libraries(LlamaTritonBackend PRIVATE TransformerTritonBackend Llama tensor memory_utils -lcublasLt) +target_compile_features(LlamaTritonBackend PRIVATE cxx_std_14) diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc new file mode 100644 index 000000000..02cfc6e9e --- /dev/null +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h" +#include "3rdparty/INIReader.h" +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h" +#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp" +#include "src/fastertransformer/utils/allocator.h" + +namespace ft = fastertransformer; + +std::shared_ptr AbstractTransformerModel::createLlamaModel(std::string inifile) +{ + INIReader reader = INIReader(inifile); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << inifile << "'\n"; + return nullptr; + } + + const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type"); + int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); + std::string model_dir = reader.Get("ft_instance_hyperparameter", "model_dir"); + + if (data_type == "half") { + return std::make_shared>( + reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + model_dir); + } + if (data_type == "bf16") { + return std::make_shared>( + reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + model_dir); + } + else { + return std::make_shared>( + reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + model_dir); + } +} + +template +LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, + size_t pipeline_para_size, + int enable_custom_all_reduce, + std::string model_dir): + tensor_para_size_(tensor_para_size), + pipeline_para_size_(pipeline_para_size), + shared_weights_(std::vector>>(ft::getDeviceCount())), + enable_custom_all_reduce_(enable_custom_all_reduce) +{ + model_dir_ = model_dir; + const 
std::string inifile{model_dir + "/config.ini"}; + INIReader reader = INIReader(inifile); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << inifile << "'\n"; + ft::FT_CHECK(false); + } + + model_name_ = reader.Get("llama", "model_name"); + head_num_ = reader.GetInteger("llama", "head_num"); + size_per_head_ = reader.GetInteger("llama", "size_per_head"); + inter_size_ = reader.GetInteger("llama", "inter_size"); + num_layer_ = reader.GetInteger("llama", "num_layer"); + vocab_size_ = reader.GetInteger("llama", "vocab_size"); + rotary_embedding_dim_ = reader.GetInteger("llama", "rotary_embedding"); + start_id_ = reader.GetInteger("llama", "start_id"); + end_id_ = reader.GetInteger("llama", "end_id"); + use_gptj_residual_ = false; + + num_tasks_ = reader.GetInteger("llama", "num_tasks", 0); + + prompt_learning_start_id_ = reader.GetInteger("llama", "prompt_learning_start_id", end_id_ + 1); + prompt_learning_type_ = + static_cast(reader.GetInteger("llama", "prompt_learning_type", 0)); + + for (int task_name_id = 0; task_name_id < num_tasks_; task_name_id++) { + std::string config_task_name = "task_" + std::to_string(task_name_id); + std::string task_name = reader.Get(config_task_name, "task_name"); + const int prompt_length = reader.GetInteger(config_task_name, "prompt_length", 0); + prompt_learning_table_pair_.insert({task_name, {task_name_id, prompt_length}}); + } +} + +template +std::unique_ptr LlamaTritonModel::createModelInstance( + int device_id, + int rank, + cudaStream_t stream, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm) +{ + ft::check_cuda_error(cudaSetDevice(device_id)); + const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_); + + std::unique_ptr> allocator( + new ft::Allocator(device_id)); + + allocator->setStream(stream); + + cublasHandle_t cublas_handle; + cublasLtHandle_t cublaslt_handle; + + cublasCreate(&cublas_handle); + cublasLtCreate(&cublaslt_handle); + cublasSetStream(cublas_handle, stream); + + std::unique_ptr cublas_algo_map(new ft::cublasAlgoMap("gemm_config.in")); + std::unique_ptr cublas_wrapper_mutex(new std::mutex()); + std::unique_ptr cublas_wrapper(new ft::cublasMMWrapper( + cublas_handle, cublaslt_handle, stream, cublas_algo_map.get(), cublas_wrapper_mutex.get(), allocator.get())); + + std::unique_ptr cuda_device_prop_ptr(new cudaDeviceProp); + ft::check_cuda_error(cudaGetDeviceProperties(cuda_device_prop_ptr.get(), device_id)); + + if (std::is_same::value) { + cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper->setFP32GemmConfig(); + } + + ft::NcclParam tensor_para = nccl_params.first[comms_rank]; + ft::NcclParam pipeline_para = nccl_params.second[comms_rank]; + + ft::AttentionType attention_type = ft::getAttentionType(size_per_head_, + ft::getSMVersion(), + true, // remove_padding + 0, // gpt supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + auto gpt = std::make_unique>( + ft::Llama(head_num_, + size_per_head_, + inter_size_, + num_layer_, + vocab_size_, + rotary_embedding_dim_, + start_id_, + end_id_, + prompt_learning_start_id_, // p/prompt tuning virtual token start id + prompt_learning_type_, + use_gptj_residual_, + 0.0f, // beam_search_diversity_rate_, + 0, // top_k_, + 0.0f, // top_p_, + 0, // random seed, note that all gpus should use same seed + 0.0f, // temperature_, + 0.0f, // len_penalty_, + 0.0f, // repetition_penalty_, + 
tensor_para, + pipeline_para, + stream, + cublas_wrapper.get(), + allocator.get(), + false, + cuda_device_prop_ptr.get(), + attention_type, + custom_all_reduce_comm, + enable_custom_all_reduce_)); + + return std::unique_ptr>( + new LlamaTritonModelInstance(std::move(gpt), + shared_weights_[device_id], + std::move(allocator), + std::move(cublas_algo_map), + std::move(cublas_wrapper_mutex), + std::move(cublas_wrapper), + std::move(cuda_device_prop_ptr))); +} + +template +void LlamaTritonModel::createSharedWeights(int device_id, int rank) +{ + ft::check_cuda_error(cudaSetDevice(device_id)); + const int tensor_para_rank = rank % tensor_para_size_; + const int pipeline_para_rank = rank / tensor_para_size_; + shared_weights_[device_id] = std::make_shared>(head_num_ * size_per_head_, + inter_size_, + vocab_size_, + num_layer_, + 0, // max_seq_len, deprecated + tensor_para_size_, + tensor_para_rank, + pipeline_para_size_, + pipeline_para_rank, + use_gptj_residual_, + prompt_learning_type_, + prompt_learning_table_pair_); + shared_weights_[device_id]->loadModel(model_dir_); + return; +} + +template +std::string LlamaTritonModel::toString() +{ + std::stringstream ss; + ss << "Model: " + << "\nhead_num: " << head_num_ << "\nsize_per_head: " << size_per_head_ << "\ninter_size: " << inter_size_ + << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_ << "\nstart_id: " << start_id_ + << "\nend_id: " << end_id_ << "\nuse_gptj_residual: " << use_gptj_residual_ + << "\nprompt_learning_type_: " << static_cast(prompt_learning_type_) + << "\nprompt_learning_start_id_: " << prompt_learning_start_id_ << "\ntensor_para_size: " << tensor_para_size_ + << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ + << "\nmodel_name: " << model_name_ << "\nmodel_dir: " << model_dir_ << std::endl; + return ss.str(); +} + +template +void LlamaTritonModel::createCustomComms( + std::vector>* custom_all_reduce_comms, int world_size) +{ + using commDataType = typename ft::CustomARCommTypeConverter::Type; + ft::initCustomAllReduceComm(custom_all_reduce_comms, enable_custom_all_reduce_, world_size); +} + +template +int LlamaTritonModel::getTensorParaSize() +{ + return tensor_para_size_; +} + +template +int LlamaTritonModel::getPipelineParaSize() +{ + return pipeline_para_size_; +} + +template struct LlamaTritonModel; +template struct LlamaTritonModel; + +#ifdef ENABLE_BF16 +template struct LlamaTritonModel<__nv_bfloat16>; +#endif diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h new file mode 100644 index 000000000..0775ed05f --- /dev/null +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
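The constructor above expects a config.ini next to the converted weights (model_dir/config.ini) with an [llama] section; the key names are exactly the ones parsed above. The values below are placeholders in the style of a 7B checkpoint and should come from your own conversion:

    [llama]
    model_name=llama_7b
    head_num=32
    size_per_head=128
    inter_size=11008
    num_layer=32
    rotary_embedding=128
    vocab_size=32000
    start_id=1   # placeholder, use your tokenizer's bos id
    end_id=2     # placeholder, use your tokenizer's eos id
    num_tasks=0  # optional, for prefix-prompt / p-tuning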
+ */ + +#pragma once + +#include "src/fastertransformer/models/llama/Llama.h" +#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp" +#include "src/fastertransformer/utils/cuda_utils.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/nccl_utils.h" +#include + +namespace ft = fastertransformer; + +template +struct LlamaTritonModel: public AbstractTransformerModel { + LlamaTritonModel(size_t tensor_para_size, + size_t pipeline_para_size, + int enable_custom_all_reduce, + std::string model_dir); + + ~LlamaTritonModel() = default; + + virtual std::unique_ptr + createModelInstance(int deviceId, + int rank, + cudaStream_t stream, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) override; + + virtual void createSharedWeights(int deviceId, int rank) override; + + virtual void createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) override; + + virtual std::string toString() override; + virtual int getTensorParaSize() override; + virtual int getPipelineParaSize() override; + +private: + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t vocab_size_; + size_t rotary_embedding_dim_; + int start_id_; + int end_id_; + size_t tensor_para_size_; + size_t pipeline_para_size_; + + // shared weights for each device + std::vector>> shared_weights_; + + // residual type + bool use_gptj_residual_ = false; + + // number of tasks (for prefix-prompt, p/prompt-tuning) + size_t num_tasks_ = 0; + int prompt_learning_start_id_ = 0; + ft::PromptLearningType prompt_learning_type_ = ft::PromptLearningType::no_prompt; + std::map> prompt_learning_table_pair_ = {}; + + bool is_fp16_; + int enable_custom_all_reduce_ = 0; + + std::string model_name_; + std::string model_dir_; +}; diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc new file mode 100644 index 000000000..4e6f841f9 --- /dev/null +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h" +#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp" +#include "src/fastertransformer/triton_backend/triton_utils.hpp" +#include "src/fastertransformer/utils/Tensor.h" +#include +#include +#include +#include + +namespace ft = fastertransformer; + +template +void triton_stream_callback(std::unordered_map* output_tensors, void* ctx) +{ + LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); + auto result = LlamaTritonModelInstance::convert_outputs(*output_tensors); + + model->stream_cb_(result, model->stream_ctx_); +} + +template +LlamaTritonModelInstance::LlamaTritonModelInstance( + std::unique_ptr> gpt, + std::shared_ptr> gpt_weight, + std::unique_ptr> allocator, + std::unique_ptr cublas_algo_map, + std::unique_ptr cublas_wrapper_mutex, + std::unique_ptr cublas_wrapper, + std::unique_ptr cuda_device_prop_ptr): + gpt_(std::move(gpt)), + gpt_weight_(gpt_weight), + allocator_(std::move(allocator)), + cublas_algo_map_(std::move(cublas_algo_map)), + cublas_wrapper_mutex_(std::move(cublas_wrapper_mutex)), + cublas_wrapper_(std::move(cublas_wrapper)), + cuda_device_prop_ptr_(std::move(cuda_device_prop_ptr)) +{ +} + +template +std::unordered_map LlamaTritonModelInstance::convert_inputs( + std::shared_ptr> input_tensors) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + + move_tensor_H2D(input_tensors->at("input_ids"), d_input_ids_, &allocator_); + move_tensor_H2D(input_tensors->at("input_lengths"), d_input_lengths_, &allocator_); + + const size_t request_batch_size = input_tensors->at("input_ids").shape[0]; + const size_t input_data_len = input_tensors->at("input_ids").shape[1]; + h_total_output_lengths_ = reinterpret_cast(malloc(request_batch_size * sizeof(uint32_t))); + for (int i = 0; i < request_batch_size; ++i) { + h_total_output_lengths_[i] = + reinterpret_cast(input_tensors->at("request_output_len").data)[i] + input_data_len; + } + + std::unordered_map ft_input_tensors = std::unordered_map{ + {"input_ids", as_GPU_tensor(input_tensors->at("input_ids"), d_input_ids_)}, + {"input_lengths", as_GPU_tensor(input_tensors->at("input_lengths"), d_input_lengths_)}, + {"output_seq_len", + ft::Tensor{ft::MEMORY_CPU, + ft::TYPE_UINT32, + {input_tensors->at("request_output_len").shape[0]}, + h_total_output_lengths_}}}; + + if (input_tensors->find("bad_words_list") != input_tensors->end()) { + move_tensor_H2D(input_tensors->at("bad_words_list"), d_input_bad_words_, &allocator_); + ft_input_tensors.insert( + {"bad_words_list", as_GPU_tensor(input_tensors->at("bad_words_list"), d_input_bad_words_)}); + } + + if (input_tensors->find("stop_words_list") != input_tensors->end()) { + move_tensor_H2D(input_tensors->at("stop_words_list"), d_input_stop_words_, &allocator_); + ft_input_tensors.insert( + {"stop_words_list", as_GPU_tensor(input_tensors->at("stop_words_list"), d_input_stop_words_)}); + } + + if (input_tensors->count("request_prompt_embedding") && input_tensors->count("request_prompt_lengths") + && input_tensors->count("request_prompt_type")) { + + move_tensor_H2D(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_, &allocator_); + ft_input_tensors.insert( + {"request_prompt_lengths", + as_GPU_tensor(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_)}); + + move_tensor_H2D(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_, &allocator_); + ft_input_tensors.insert( + {"request_prompt_embedding", + 
as_GPU_tensor(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_)}); + } + + if (input_tensors->find("top_p_decay") != input_tensors->end()) { + move_tensor_H2D(input_tensors->at("top_p_decay"), d_top_p_decay_, &allocator_); + ft_input_tensors.insert({"top_p_decay", as_GPU_tensor(input_tensors->at("top_p_decay"), d_top_p_decay_)}); + } + if (input_tensors->find("top_p_min") != input_tensors->end()) { + move_tensor_H2D(input_tensors->at("top_p_min"), d_top_p_min_, &allocator_); + ft_input_tensors.insert({"top_p_min", as_GPU_tensor(input_tensors->at("top_p_min"), d_top_p_min_)}); + } + if (input_tensors->find("top_p_reset_ids") != input_tensors->end()) { + move_tensor_H2D(input_tensors->at("top_p_reset_ids"), d_top_p_reset_ids_, &allocator_); + ft_input_tensors.insert( + {"top_p_reset_ids", as_GPU_tensor(input_tensors->at("top_p_reset_ids"), d_top_p_reset_ids_)}); + } + + for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) { + if (t->first.find("input_ids") == std::string::npos && t->first.find("input_lengths") == std::string::npos + && t->first.find("output_seq_len") == std::string::npos + && t->first.find("prefix_soft_prompt_embedding") == std::string::npos + && t->first.find("prefix_soft_prompt_lengths") == std::string::npos) { + if (ft_input_tensors.count(t->first) == 0) { + ft_input_tensors.insert({t->first, t->second.convertTritonTensorToFt()}); + } + } + } + + return ft_input_tensors; +} + +template +std::shared_ptr> +LlamaTritonModelInstance::convert_outputs(const std::unordered_map& output_tensors) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + std::unordered_map* outputs_mapping = + new std::unordered_map(); + + for (auto it = output_tensors.begin(); it != output_tensors.end(); it++) { + outputs_mapping->insert({it->first, triton::Tensor::convertFtTensorToTriton(it->second)}); + } + + return std::shared_ptr>(outputs_mapping); +} + +template +std::shared_ptr> +LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors) +{ + ft::FT_CHECK(false); + return nullptr; +} + +template +std::shared_ptr> +LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape.size() == 2, + "input_tensors->at(\"input_ids\").shape.size() == 2"); + FT_CHECK_WITH_INFO(input_tensors->at("input_lengths").shape.size() == 1, + "input_tensors->at(\"input_lengths\").shape.size() == 1"); + + const uint32_t request_batch_size = input_tensors->at("input_ids").shape[0]; + const uint32_t max_request_output_len = (size_t)*std::max_element( + (int*)input_tensors->at("request_output_len").data, + (int*)input_tensors->at("request_output_len").data + input_tensors->at("request_output_len").shape[0]); + const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1]; + const uint32_t beam_width = + input_tensors->count("beam_width") ? 
(size_t)(*(uint*)input_tensors->at("beam_width").data) : 1; + + allocateBuffer(request_batch_size, beam_width, total_output_len, max_request_output_len); + + std::unordered_map ft_input_tensors = convert_inputs(input_tensors); + + std::unordered_map output_tensors = std::unordered_map{ + {"output_ids", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_UINT32, + std::vector{request_batch_size, beam_width, total_output_len}, + d_output_ids_}}, + {"sequence_length", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_UINT32, + std::vector{request_batch_size, beam_width}, + d_sequence_lengths_}}}; + + if (input_tensors->count("is_return_log_probs") && *((bool*)input_tensors->at("is_return_log_probs").data)) { + output_tensors.insert({"output_log_probs", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_FP32, + std::vector{request_batch_size, beam_width, max_request_output_len}, + d_output_log_probs_}}); + output_tensors.insert({"cum_log_probs", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_FP32, + std::vector{request_batch_size, beam_width}, + d_cum_log_probs_}}); + } + try { + if (stream_cb_ != nullptr) { + gpt_->registerCallback(triton_stream_callback, this); + } + + gpt_->forward(&output_tensors, &ft_input_tensors, gpt_weight_.get()); + + if (stream_cb_ != nullptr) { + gpt_->unRegisterCallback(); + } + } + catch (...) { + h_exception_ = std::current_exception(); + output_tensors.insert({"error_message", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, &h_exception_}}); + } + + if (h_total_output_lengths_ != nullptr) { + free(h_total_output_lengths_); + h_total_output_lengths_ = nullptr; + } + + return convert_outputs(output_tensors); +} + +template +LlamaTritonModelInstance::~LlamaTritonModelInstance() +{ + freeBuffer(); +} + +template +void LlamaTritonModelInstance::allocateBuffer(const size_t request_batch_size, + const size_t beam_width, + const size_t total_output_len, + const size_t max_request_output_len) +{ + d_output_ids_ = (int*)(allocator_->reMalloc( + d_output_ids_, sizeof(int) * request_batch_size * beam_width * total_output_len, false)); + d_sequence_lengths_ = + (int*)(allocator_->reMalloc(d_sequence_lengths_, sizeof(int) * request_batch_size * beam_width, false)); + d_output_log_probs_ = (float*)(allocator_->reMalloc( + d_output_log_probs_, sizeof(float) * request_batch_size * beam_width * max_request_output_len, false)); + d_cum_log_probs_ = + (float*)(allocator_->reMalloc(d_cum_log_probs_, sizeof(float) * request_batch_size * beam_width, false)); +} + +template +void LlamaTritonModelInstance::freeBuffer() +{ + allocator_->free((void**)(&d_output_ids_)); + allocator_->free((void**)(&d_sequence_lengths_)); + allocator_->free((void**)(&d_output_log_probs_)); + allocator_->free((void**)(&d_cum_log_probs_)); +} + +template struct LlamaTritonModelInstance; +template struct LlamaTritonModelInstance; + +#ifdef ENABLE_BF16 +template struct LlamaTritonModelInstance<__nv_bfloat16>; +#endif diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h new file mode 100644 index 000000000..0a2418641 --- /dev/null +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
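For orientation, the tensors returned by forward() above have the following shapes (sketched with toy sizes; the log-prob entries are only present when is_return_log_probs is set in the request):

    # illustrative sizes
    batch, beam, input_len, max_request_output_len = 2, 1, 8, 32
    total_output_len = input_len + max_request_output_len

    output_shapes = {
        "output_ids":       (batch, beam, total_output_len),          # uint32 in the tensor map
        "sequence_length":  (batch, beam),
        # only when is_return_log_probs is true:
        "output_log_probs": (batch, beam, max_request_output_len),
        "cum_log_probs":    (batch, beam),
    }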
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "src/fastertransformer/models/llama/Llama.h" +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h" +#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp" +#include + +namespace ft = fastertransformer; + +template +struct LlamaTritonModelInstance: AbstractTransformerModelInstance { + + LlamaTritonModelInstance(std::unique_ptr> gpt, + std::shared_ptr> gpt_weight, + std::unique_ptr> allocator, + std::unique_ptr cublas_algo_map, + std::unique_ptr cublas_wrapper_mutex, + std::unique_ptr cublas_wrapper, + std::unique_ptr cuda_device_prop_ptr); + ~LlamaTritonModelInstance(); + + std::shared_ptr> + forward(std::shared_ptr> input_tensors) override; + + std::shared_ptr> + forward(std::shared_ptr> input_tensors) override; + + static std::shared_ptr> + convert_outputs(const std::unordered_map& output_tensors); + +private: + const std::unique_ptr> gpt_; + const std::shared_ptr> gpt_weight_; + const std::unique_ptr> allocator_; + const std::unique_ptr cublas_algo_map_; + const std::unique_ptr cublas_wrapper_mutex_; + const std::unique_ptr cublas_wrapper_; + const std::unique_ptr cuda_device_prop_ptr_; + + std::unordered_map + convert_inputs(std::shared_ptr> input_tensors); + + void allocateBuffer(const size_t request_batch_size, + const size_t beam_width, + const size_t total_output_len, + const size_t max_request_output_len); + void freeBuffer(); + + int* d_input_ids_ = nullptr; + int* d_input_lengths_ = nullptr; + int* d_input_bad_words_ = nullptr; + int* d_input_stop_words_ = nullptr; + int* d_request_prompt_lengths_ = nullptr; + T* d_request_prompt_embedding_ = nullptr; + float* d_top_p_decay_ = nullptr; + float* d_top_p_min_ = nullptr; + int* d_top_p_reset_ids_ = nullptr; + + int* d_output_ids_ = nullptr; + int* d_sequence_lengths_ = nullptr; + float* d_output_log_probs_ = nullptr; + float* d_cum_log_probs_ = nullptr; + + uint32_t* h_total_output_lengths_ = nullptr; + std::exception_ptr h_exception_ = nullptr; +}; diff --git a/src/fastertransformer/triton_backend/transformer_triton_backend.hpp b/src/fastertransformer/triton_backend/transformer_triton_backend.hpp index 47cf6750c..1567b7310 100644 --- a/src/fastertransformer/triton_backend/transformer_triton_backend.hpp +++ b/src/fastertransformer/triton_backend/transformer_triton_backend.hpp @@ -293,6 +293,7 @@ struct AbstractTransformerModel { static std::shared_ptr createGptNeoXModel(std::string inifile); static std::shared_ptr createT5Model(std::string model_dir); static std::shared_ptr createT5EncoderModel(std::string model_dir); + static std::shared_ptr createLlamaModel(std::string inifile); std::pair, std::vector> createNcclParams(const int node_id, const int device_id_start = 0, const bool multi_node = false); From a6d51ec08d428cc27a0e949be3f3c5ea4fd5a0c6 Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 25 Apr 2023 05:37:26 +0000 Subject: [PATCH 06/21] register RMS for bf16 --- src/fastertransformer/kernels/layernorm_kernels.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/fastertransformer/kernels/layernorm_kernels.cu 
b/src/fastertransformer/kernels/layernorm_kernels.cu index 369030b37..60e6f001a 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.cu +++ b/src/fastertransformer/kernels/layernorm_kernels.cu @@ -1490,6 +1490,17 @@ template void invokeGeneralAddResidualT5PreLayerNorm(half* output, int n, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeGeneralAddResidualT5PreLayerNorm(__nv_bfloat16* output, + __nv_bfloat16* norm_output, + const __nv_bfloat16* input, + const __nv_bfloat16* gamma, + const float layernorm_eps, + int m, + int n, + cudaStream_t stream); +#endif + template void invokeGeneralAddBiasResidualT5PreLayerNorm(T* output, T* norm_output, From 7a72ca33f49ba901fb093ad3d753561deca8654f Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 25 Apr 2023 12:10:57 +0000 Subject: [PATCH 07/21] revert bf16 --- src/fastertransformer/models/llama/Llama.cc | 4 ---- src/fastertransformer/models/llama/LlamaContextDecoder.cc | 4 ---- src/fastertransformer/models/llama/LlamaDecoder.cc | 4 ---- src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc | 4 ---- src/fastertransformer/models/llama/LlamaWeight.cc | 4 ---- 5 files changed, 20 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 0f547d2c5..d0305e84a 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -1206,8 +1206,4 @@ bool* Llama::getFinishBuffer() template class Llama; template class Llama; -#ifdef ENABLE_BF16 -template class Llama<__nv_bfloat16>; -#endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index 900e1f016..8ea1494fb 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -500,8 +500,4 @@ void LlamaContextDecoder::forward(std::unordered_map* template class LlamaContextDecoder; template class LlamaContextDecoder; -#ifdef ENABLE_BF16 -template class LlamaContextDecoder<__nv_bfloat16>; -#endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index dd9a51cee..23e9b0eec 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -377,8 +377,4 @@ void LlamaDecoder::forward(std::unordered_map* template class LlamaDecoder; template class LlamaDecoder; -#ifdef ENABLE_BF16 -template class LlamaDecoder<__nv_bfloat16>; -#endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index dc99451b9..39a1d5c1f 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -222,8 +222,4 @@ void LlamaDecoderLayerWeight::mallocWeights() template struct LlamaDecoderLayerWeight; template struct LlamaDecoderLayerWeight; -#ifdef ENABLE_BF16 -template class LlamaDecoderLayerWeight<__nv_bfloat16>; -#endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc index 01741e4f2..65c2f762d 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.cc +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -298,8 +298,4 @@ bool 
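The kernel instantiated for bf16 above, invokeGeneralAddResidualT5PreLayerNorm, fuses the residual add with the same bias-free RMS normalization sketched earlier. Its assumed behaviour, inferred from the argument names rather than the kernel source:

    import numpy as np

    def add_residual_rms_prenorm(hidden, residual, gamma, eps=1e-6):
        # residual add written back to `hidden`, then RMS pre-norm into a second buffer
        hidden = hidden + residual
        variance = np.mean(hidden ** 2, axis=-1, keepdims=True)
        norm_hidden = hidden / np.sqrt(variance + eps) * gamma
        return hidden, norm_hidden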
LlamaWeight::isValidLayerParallelId(int l) template struct LlamaWeight; template struct LlamaWeight; -#ifdef ENABLE_BF16 -template class LlamaWeight<__nv_bfloat16>; -#endif - } // namespace fastertransformer From 9820565367d9c8f3d89a5769c74ad09cf853e781 Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 25 Apr 2023 13:15:42 +0000 Subject: [PATCH 08/21] revert bf16 --- CMakeLists.txt | 10 ++++++++-- .../triton_backend/llama/LlamaTritonModel.cc | 10 ---------- .../triton_backend/llama/LlamaTritonModelInstance.cc | 3 --- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 870e67f0a..5eb40abe8 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -348,6 +348,12 @@ add_library(transformer-shared SHARED $ $ $ + $ + $ + $ + $ + $ + $ $ $ $ @@ -428,9 +434,9 @@ target_link_libraries(transformer-shared PUBLIC -lnvToolsExt ) endif() - + if (ENABLE_FP8) -target_link_libraries(transformer-shared PUBLIC +target_link_libraries(transformer-shared PUBLIC $ $ $ diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc index 02cfc6e9e..a7f336843 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc @@ -41,13 +41,6 @@ std::shared_ptr AbstractTransformerModel::createLlamaM reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), model_dir); } - if (data_type == "bf16") { - return std::make_shared>( - reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), - model_dir); - } else { return std::make_shared>( reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), @@ -248,6 +241,3 @@ int LlamaTritonModel::getPipelineParaSize() template struct LlamaTritonModel; template struct LlamaTritonModel; -#ifdef ENABLE_BF16 -template struct LlamaTritonModel<__nv_bfloat16>; -#endif diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc index 4e6f841f9..593fc6c97 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc @@ -260,6 +260,3 @@ void LlamaTritonModelInstance::freeBuffer() template struct LlamaTritonModelInstance; template struct LlamaTritonModelInstance; -#ifdef ENABLE_BF16 -template struct LlamaTritonModelInstance<__nv_bfloat16>; -#endif From bfeebefaaf486432ca57249392d4db3b85a83570 Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 25 Apr 2023 13:38:12 +0000 Subject: [PATCH 09/21] bugfix --- examples/cpp/llama/llama_example.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 5e84e13d9..0105cb6aa 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -60,11 +60,6 @@ int main(int argc, char* argv[]) else if (data_type == "fp16") { llama_example(reader); } -#ifdef ENABLE_BF16 - else if (data_type == "bf16") { - llama_example<__nv_bfloat16>(reader); - } -#endif else { FT_LOG_ERROR("is_fp16 should be 0 (use float) or 1 (use half)."); return -1; From 0379cc5cbf085673a2527ec9fd28f7dd99eec537 Mon Sep 17 00:00:00 2001 From: void-main Date: Fri, 28 Apr 2023 07:31:04 +0000 Subject: 
[PATCH 10/21] add megatron llama convert --- .../gpt/utils/megatron_ckpt_convert_llama.py | 539 ++++++++++++++++++ 1 file changed, 539 insertions(+) create mode 100644 examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py diff --git a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py new file mode 100644 index 000000000..02b4cf12a --- /dev/null +++ b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py @@ -0,0 +1,539 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import configparser +import datetime +import json +import multiprocessing +import pathlib +import re +import shutil +import sys + +import numpy as np +import torch # pytype: disable=import-error + +# verify if root package is in PYTHONPATH +__root_package_path__ = pathlib.Path(__file__).parent.parent.parent.parent.parent.absolute().as_posix() +if __root_package_path__ not in sys.path: + print( + f"[ERROR] add project root directory to your PYTHONPATH with " + f"'export PYTHONPATH={__root_package_path__}:${{PYTHONPATH}}'" + ) + +from examples.pytorch.gpt.utils.gpt import DEFAULT_START_TAG, DEFAULT_END_TAG, OPENAI_GPT2_START_ID, OPENAI_GPT2_END_ID +from examples.pytorch.utils import torch2np, safe_transpose, cpu_map_location, gpu_map_location, WEIGHT2DTYPE + + +def _inject_model_parallel_rank( + filepath, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + tensor_model_parallel_rank=0, + pipeline_model_parallel_rank=0, +): + """ + Injects tensor/pipeline model parallel ranks into the filepath. + Does nothing if not using model parallelism. + """ + filepath = pathlib.Path(filepath) + if tensor_model_parallel_size > 1 or pipeline_model_parallel_size > 1: + # filepath needs to be updated to include mp_rank + if pipeline_model_parallel_size is None or pipeline_model_parallel_size == 1: + filepath = filepath.parent / f"mp_rank_{tensor_model_parallel_rank:02d}" / filepath.name + else: + filepath = ( + filepath.parent / + f"mp_rank_{tensor_model_parallel_rank:02d}_{pipeline_model_parallel_rank:03d}" / + filepath.name + ) + if not filepath.exists(): + filepath = ( + filepath.parent / + f"tp_rank_{tensor_model_parallel_rank:02d}_pp_rank_{pipeline_model_parallel_rank:03d}" / + filepath.name + ) + return filepath + else: + if filepath.exists(): + return filepath + else: + return filepath.parent / "mp_rank_00" / filepath.name + + +def _create_model_training_args_for_checkpoint_version_0(args, model_00): + model_training_args = argparse.Namespace() + if args.head_num is None or args.trained_tensor_parallel_size is None: + raise ValueError( + "Provided checkpoint have missing training args. 
" + "Thus it is required to provide -head_num and -trained_tensor_parallel_size CLI arguments" + ) + model_training_args.num_attention_heads = args.head_num + model_training_args.tensor_model_parallel_size = args.trained_tensor_parallel_size + # megatron ckpt_ver=0 only supports pipeline_parallel_size = 1 + model_training_args.pipeline_model_parallel_size = 1 + model_training_args.max_position_embeddings = \ + model_00["model"]["language_model"]["embedding"]["position_embeddings"]["weight"].shape[0] + model_training_args.hidden_size = \ + model_00["model"]["language_model"]["embedding"]["position_embeddings"]["weight"].shape[1] + model_training_args.ffn_hidden_size = 4 * model_training_args.hidden_size + + def get_layer_num_from_weights(model_keys): + layer_num = 1 + for key in model_keys: + if re.search(r'\d+', key) is not None: + layer_num = max(int(re.search(r'\d+', key).group()), layer_num) + return layer_num + 1 + + model_training_args.num_layers = \ + get_layer_num_from_weights(model_00["model"]["language_model"]['transformer'].keys()) + + model_training_args.layernorm_epsilon = 1e-6 + + return model_training_args + + +# This tool is used to support the new megatron model trained by pipeline parallel + tensor parallel +def merge_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, model_training_args, transformer_model_list, ckpt_ver, np_weight_data_type): + saved_dir = pathlib.Path(saved_dir) + if key.find("layers.") != -1: + layer_index = (int)(key[7 : key.find(".", 7)]) + saved_key = key.replace( + "layers.%d." % layer_index, + "layers.%d." % (layer_index + pipeline_para_rank * model_training_args.num_layers // model_training_args.pipeline_model_parallel_size)) + + if saved_key.find("self_attention") != -1: + saved_key = saved_key.replace("self_attention", "attention") + if saved_key.find("adaptor1") != -1: + saved_key = saved_key.replace("adaptor1", "after_attention_adapter") + if saved_key.find("adaptor2") != -1: + saved_key = saved_key.replace("adaptor2", "after_ffn_adapter") + else: + saved_key = key + major_device = transformer_model_list[0][key].device + + if ( + key.find("input_layernorm.weight") != -1 + or key.find("post_attention_layernorm.weight") != -1 + or key.find("final_layernorm.weight") != -1): + + # shared weights, only need to convert the weights of rank 0 + if i == 0: + saved_path = saved_dir / f"model.{saved_key}.bin" + val = safe_transpose(transformer_model_list[0][key]) + val = torch2np(val, np_weight_data_type) + val = np.squeeze(val) + val.tofile(saved_path) + + elif (key.find("attention.dense.weight") != -1 + or key.find("mlp.dense_4h_to_h.weight") != -1 + or key.find("adaptor1.dense_4h_to_h.weight") != -1 + or key.find("adaptor2.dense_4h_to_h.weight") != -1): + vals = [ + safe_transpose(transformer_model_list[k][key]).float().to(major_device) + for k in range(factor) + ] + val = torch.cat(vals, dim=0) + val = torch2np(val, np_weight_data_type) + saved_path = saved_dir / f"model.{saved_key}.{i:d}.bin" + val.tofile(saved_path) + + elif (key.find("mlp.dense_h_to_4h.weight") != -1 + or key.find("adaptor1.dense_h_to_4h.weight") != -1 + or key.find("adaptor2.dense_h_to_4h.weight") != -1): + vals = [ + safe_transpose(transformer_model_list[k][key]).float().to(major_device) + for k in range(factor) + ] + val = torch.cat(vals, dim=-1) + val = torch2np(val, np_weight_data_type) + saved_path = saved_dir / f"model.{saved_key}.{i:d}.bin" + val.tofile(saved_path) + + + elif key.find("attention.query_key_value.weight") != -1: + vals = [] + for k in 
range(factor): + val = safe_transpose(transformer_model_list[k][key]).float() + hidden_dim = val.shape[0] + local_dim = int(val.shape[-1] / 3) + if ckpt_ver == 3: + num_splits = 3 + head_num = model_training_args.num_attention_heads + size_per_head = hidden_dim // head_num + head_num = head_num // model_training_args.tensor_model_parallel_size + val = val.reshape(hidden_dim, head_num, num_splits, size_per_head) + val = val.permute(0, 2, 1, 3) + val = val.reshape(hidden_dim, 3, local_dim) + vals.append(val.to(major_device)) + val = torch.cat(vals, dim=-1) + val = torch2np(val, np_weight_data_type) + saved_path = saved_dir / f"model.{saved_key}.{i:d}.bin" + val.tofile(saved_path) + + else: + print(f"[ERROR] cannot find key '{key}'") + +def preprocess_h_to_4h(val, training_tp_size, np_weight_data_type): + """ + megatron saved format: [TP_train, 2, inter_size / 2 / TP, hidden_size] + FT needed format: [hidden_size, inter_size], [hidden_size, inter_size] + This function translates megatron weight to FT weight + """ + val_shape = val.shape + val = val.view(training_tp_size, + 2, + val_shape[0] // (2 * training_tp_size), + val_shape[1]) + val = val.transpose(0, 1).contiguous() + val = val.view(*val_shape) + + val = safe_transpose(val) + return torch2np(val, np_weight_data_type) + +def split_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, model_training_args, transformer_model_list, ckpt_ver, np_weight_data_type): + val = safe_transpose(transformer_model_list[0][key]) + val = torch2np(val, np_weight_data_type) + print(f"key: {key}, shape: {val.shape}") + if key.find("layers.") != -1: + layer_index = (int)(key[7 : key.find(".", 7)]) + saved_key = key.replace( + "layers.%d." % layer_index, + "layers.%d." % (layer_index + pipeline_para_rank * model_training_args.num_layers // model_training_args.pipeline_model_parallel_size)) + + if saved_key.find("self_attention") != -1: + saved_key = saved_key.replace("self_attention", "attention") + if saved_key.find("mlp.dense_4h_to_h")!= -1: + saved_key = saved_key.replace("mlp.dense_4h_to_h", "mlp.down_proj") + + else: + saved_key = key + + if ( + key.find("input_layernorm.weight") != -1 + or key.find("post_attention_layernorm.weight") != -1 + or key.find("final_layernorm.weight") != -1 + ): + # shared weights, only need to convert the weights of rank 0 + if i == 0: + saved_path = saved_dir / f"model.{saved_key}.bin" + val.tofile(saved_path.as_posix()) + + elif (key.find("attention.dense.weight") != -1 + or key.find("mlp.dense_4h_to_h.weight") != -1): + split_vals = np.split(val, factor, axis=0) + for j in range(factor): + print(f'saving {saved_key}, shape: {split_vals[j].shape}') + saved_path = saved_dir / f"model.{saved_key}.{i * factor + j:d}.bin" + split_vals[j].tofile(saved_path.as_posix()) + + elif (key.find("mlp.dense_h_to_4h.weight") != -1): + gate_weight, up_weight = np.split(val, 2, axis=-1) + print(f'dense_h_to_4h shape: {val.shape}, gate shape: {gate_weight.shape}, up shape: {up_weight.shape}') + + split_gate_weight = np.split(gate_weight, factor, axis=-1) + proj_key = saved_key.replace('mlp.dense_h_to_4h.weight','mlp.gate_proj.weight') + for j in range(factor): + print(f'saving {proj_key}, shape: {split_gate_weight[j].shape}') + saved_path = saved_dir / f"model.{proj_key}.{i * factor + j:d}.bin" + split_gate_weight[j].tofile(saved_path.as_posix()) + + split_up_weight = np.split(up_weight, factor, axis=-1) + proj_key = saved_key.replace('mlp.dense_h_to_4h.weight','mlp.up_proj.weight') + for j in range(factor): + print(f'saving 
{proj_key}, shape: {split_up_weight[j].shape}') + saved_path = saved_dir / f"model.{proj_key}.{i * factor + j:d}.bin" + split_up_weight[j].tofile(saved_path.as_posix()) + + + elif key.find("attention.query_key_value.weight") != -1: + hidden_dim = val.shape[0] + local_dim = int(val.shape[-1] / 3) + + if ckpt_ver == 3: + num_splits = 3 + head_num = model_training_args.num_attention_heads + size_per_head = hidden_dim // head_num + head_num = head_num // model_training_args.tensor_model_parallel_size + val = val.reshape(hidden_dim, head_num, num_splits, size_per_head) + val = val.transpose(0, 2, 1, 3) + + val = val.reshape(hidden_dim, 3, local_dim) + split_vals = np.split(val, factor, axis=-1) + + for j in range(factor): + print(f'saving {saved_key}, shape: {split_vals[j].shape}') + saved_path = saved_dir / f"model.{saved_key}.{i * factor + j:d}.bin" + split_vals[j].tofile(saved_path.as_posix()) + + else: + print(f"[ERROR] cannot find key '{key}'") + + +def _get_checkpoint_name(checkpoint_dir): + + checkpoint_dir = pathlib.Path(checkpoint_dir) + patterns = [ + "model_optim_rng.pt", # older megatron checkpoints + "*last.ckpt", # newer format of checkpoints + ] + for pattern in patterns: + model_files = sorted(list(checkpoint_dir.rglob(pattern))) + if model_files: + return model_files[0].name + + raise ValueError(f"Could not find checkpoint files in {checkpoint_dir}") + + +def convert_checkpoint(args): + saved_dir = pathlib.Path(args.saved_dir) / f"{args.infer_gpu_num:d}-gpu" + if saved_dir.exists(): + print(f"[ERROR] Remove {saved_dir} target directory before running conversion") + sys.exit(1) + saved_dir.mkdir(parents=True) + + if args.vocab_path: + shutil.copy(args.vocab_path, (saved_dir / "vocab.json").as_posix()) + if args.merges_path: + shutil.copy(args.merges_path, (saved_dir / "merges.txt").as_posix()) + + load_checkpoints_to_cpu = bool(args.load_checkpoints_to_cpu) + map_location_fn = cpu_map_location if load_checkpoints_to_cpu else gpu_map_location + + checkpoints_dir = pathlib.Path(args.in_file) + checkpoint_name = _get_checkpoint_name(checkpoints_dir) + + # load position_embedding from rank 0 + checkpoints_paths = sorted(checkpoints_dir.rglob(checkpoint_name)) + if not checkpoints_paths: + print(f"[ERROR] Cannot find checkpoint in {checkpoints_dir}.") + exit(1) + model_00 = torch.load(checkpoints_paths[0].as_posix(), map_location=map_location_fn) + + if "hyper_parameters" in list(model_00.keys()): + print("Use nemo_ckpt_converter.py script for conversion of this checkpoint") + exit(1) + elif "args" in list(model_00.keys()): + checkpoint_version = model_00["checkpoint_version"] + model_training_args = model_00["args"] + megatron_gpt_key = "encoder" + else: + checkpoint_version = 0 + model_training_args = _create_model_training_args_for_checkpoint_version_0(args, model_00) + megatron_gpt_key = "transformer" + + with (saved_dir / "args.txt").open("w") as training_args_file: + for k, v in vars(model_training_args).items(): + training_args_file.write(f"{k}:{v}\n") + + np_weight_data_type = WEIGHT2DTYPE[args.weight_data_type] + + del model_00 + w_e_list = [] + w_e_head_list = [] + + training_tensor_para_size = model_training_args.tensor_model_parallel_size + training_pipeline_para_size = model_training_args.pipeline_model_parallel_size + inference_tensor_para_size = args.infer_gpu_num + + model_weights_paths = [ + [ + _inject_model_parallel_rank( + checkpoints_dir / checkpoint_name, + tensor_model_parallel_size=training_tensor_para_size, + 
pipeline_model_parallel_size=training_pipeline_para_size, + tensor_model_parallel_rank=tp_rank, + pipeline_model_parallel_rank=pp_rank, + ) + for pp_rank in range(training_pipeline_para_size) + ] + for tp_rank in range(training_tensor_para_size) + ] + + if training_tensor_para_size > inference_tensor_para_size: + assert training_tensor_para_size % inference_tensor_para_size == 0 + is_merge_ckpt = True + factor = int(training_tensor_para_size / inference_tensor_para_size) + else: + assert inference_tensor_para_size % training_tensor_para_size == 0 + is_merge_ckpt = False + factor = int(inference_tensor_para_size / training_tensor_para_size) + + main_loop = min(training_tensor_para_size, inference_tensor_para_size) + vocab_size_list = [0 for i in range(main_loop)] + + torch.multiprocessing.set_start_method("spawn") + torch.multiprocessing.set_sharing_strategy("file_system") + pool = multiprocessing.Pool(args.processes) + has_adapters = False + for i in range(main_loop): # tp + for j in range(training_pipeline_para_size): # pp + + transformer_models = [] + if is_merge_ckpt: + for k in range(factor): + m = torch.load(model_weights_paths[i * factor + k][j].as_posix(), map_location=map_location_fn) + if not has_adapters: + has_adapters = any("adaptor" in key for key in m['model']['language_model'][megatron_gpt_key].keys()) + transformer_models.append(m["model"]["language_model"][megatron_gpt_key]) + + if j == 0: + vocab_size_list[i] = m["model"]["language_model"]["embedding"]["word_embeddings"]["weight"].shape[0] + w_e_list.append(torch2np(m["model"]["language_model"]["embedding"]["word_embeddings"]["weight"], np_weight_data_type)) + if j == training_pipeline_para_size - 1: + w_e_head_list.append(torch2np(m["model"]["word_embeddings"]["weight"], np_weight_data_type)) + + else: + m = torch.load(model_weights_paths[i][j].as_posix(), map_location=map_location_fn) + + if not has_adapters: + has_adapters = any("adaptor" in key for key in m['model']['language_model'][megatron_gpt_key].keys()) + + if j == 0: + vocab_size_list[i] = m["model"]["language_model"]["embedding"]["word_embeddings"]["weight"].shape[0] + w_e_list.append(torch2np( + m["model"]["language_model"]["embedding"]["word_embeddings"]["weight"], + np_weight_data_type + )) + if j == training_pipeline_para_size - 1: + w_e_head_list.append(torch2np( + m["model"]["word_embeddings_for_head"]["weight"], + np_weight_data_type + )) + transformer_models.append(m["model"]["language_model"][megatron_gpt_key]) + + pool.starmap( + merge_and_convert_process if is_merge_ckpt else split_and_convert_process, + [ + ( + i, + j, + saved_dir, + factor, + k, + model_training_args, + transformer_models, + checkpoint_version, + np_weight_data_type, + ) + for (k, v) in transformer_models[0].items() + ], + ) + + pool.close() + pool.join() + + torch.cuda.synchronize() + + np.concatenate(w_e_list, axis=0).tofile((saved_dir / "model.wte.weight.bin").as_posix()) + np.concatenate(w_e_head_list, axis=0).tofile((saved_dir / "model.lm_head.weight.bin").as_posix()) + + # save vocab_size + full_vocab_size = sum(vocab_size_list) + if not hasattr(model_training_args, "padded_vocab_size"): + model_training_args.padded_vocab_size = full_vocab_size + + # Configuration for the model (load by triton backends) + config = configparser.ConfigParser() + config["llama"] = {} + try: + config["llama"]["model_name"] = "llama" + config["llama"]["head_num"] = str(model_training_args.num_attention_heads) + config["llama"]["size_per_head"] = str(model_training_args.hidden_size // 
model_training_args.num_attention_heads) + config["llama"]["rotary_embedding"] = str(model_training_args.hidden_size // model_training_args.num_attention_heads) + config["llama"]["inter_size"] = str(model_training_args.ffn_hidden_size) + config["llama"]["num_layer"] = str(model_training_args.num_layers) + config["llama"]["max_pos_seq_len"] = str(model_training_args.max_position_embeddings) + config["llama"]["vocab_size"] = str(model_training_args.padded_vocab_size) + config["llama"]["start_id"] = '0' + config["llama"]["end_id"] = '1' + config["llama"]["weight_data_type"] = args.weight_data_type + config["llama"]["tensor_para_size"] = str(args.infer_gpu_num) + with open((saved_dir / f"config.ini").as_posix(), 'w') as configfile: + config.write(configfile) + except Exception as e: + print(f"Fail to save the config in config.ini: {e}") + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("--saved-dir", "-saved_dir", "-o", help="folder name of output files", required=True) + parser.add_argument( + "--in-file", "-in_file", "-i", help="file name of input checkpoint file", required=True + ) + parser.add_argument( + "--infer-gpu-num", "-infer_gpu_num", "-i_g", type=int, help="How many gpus for inference", required=True + ) + # -h_n and -t_g are needed when megatron_ckpt_version = 0, for example the public megatron 345M gpt model + parser.add_argument( + "--head-num", + "-head_num", + "-h_n", + type=int, + help="The number of heads, only needed when weight doesn't contain structure hyperparameters" + ) + parser.add_argument( + "--trained-tensor-parallel-size", + "-trained_tensor_parallel_size", + "-t_g", + type=int, + help="the tensor parallel size for training" + ) + parser.add_argument( + "--processes", + "-processes", + "-p", + type=int, + default=16, + help="How many processes to spawn for conversion", + ) + parser.add_argument( + "--weight-data-type", "-weight_data_type", choices=["fp32", "fp16"], default="fp32", help="" + ) + parser.add_argument( + "--load-checkpoints-to-cpu", + "-load_checkpoints_to_cpu", + "-cpu", + type=int, + choices=[0, 1], + default=1, + help="Whether to load model weights to CPU", + ) + parser.add_argument( + "--vocab-path", + type=str, + help="Path to vocabulary file to embed in FasterTransformer checkpoint", + required=False, + ) + parser.add_argument( + "--merges-path", type=str, help="Path to merges file to embed in FasterTransformer checkpoint", required=False + ) + + args = parser.parse_args() + print("\n=============== Argument ===============") + for key in vars(args): + print(f"{key}: {vars(args)[key]}") + print("========================================") + + print("[INFO] Started to convert the model, normally it takes around 10 minutes.") + + start_time = datetime.datetime.now() + convert_checkpoint(args) + run_time = datetime.datetime.now() - start_time + print(f"[INFO] Spent {run_time} (h:m:s) to convert the model") + + +if __name__ == "__main__": + main() From d65adf119c2ffd30b7804360fe8a2f198dfe0f20 Mon Sep 17 00:00:00 2001 From: Void Main Date: Sat, 29 Apr 2023 01:15:33 +0800 Subject: [PATCH 11/21] Update src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h Co-authored-by: Bram Wasti --- .../triton_backend/llama/LlamaTritonModelInstance.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h index 0a2418641..a75e0692d 
100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h @@ -45,9 +45,9 @@ struct LlamaTritonModelInstance: AbstractTransformerModelInstance { convert_outputs(const std::unordered_map& output_tensors); private: + const std::unique_ptr> allocator_; const std::unique_ptr> gpt_; const std::shared_ptr> gpt_weight_; - const std::unique_ptr> allocator_; const std::unique_ptr cublas_algo_map_; const std::unique_ptr cublas_wrapper_mutex_; const std::unique_ptr cublas_wrapper_; From cf1b9b170c13c8648a49ee464c0037a6d4d61a73 Mon Sep 17 00:00:00 2001 From: void-main Date: Sat, 29 Apr 2023 14:34:12 +0000 Subject: [PATCH 12/21] donot callback too frequnetly --- src/fastertransformer/models/llama/Llama.cc | 17 ++++++++++++++++- src/fastertransformer/models/llama/Llama.h | 3 +++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index d0305e84a..f078861e2 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -70,6 +70,21 @@ void Llama::initialize() allocator_, is_free_buffer_after_forward_, cuda_device_prop_); + + // parse env overrides + if (std::getenv("LLAMA_STREAM_CB_STEP") != nullptr) { + try { + int callback_step_from_env = stoi( + std::string(std::getenv("LLAMA_STREAM_CB_STEP")) + ); + token_generated_cb_step_ = callback_step_from_env; + FT_LOG_INFO("Override stream callback step to %d from LLAMA_STREAM_CB_STEP", + token_generated_cb_step_); + } catch (...) { + FT_LOG_WARNING("convert LLAMA_STREAM_CB_STEP err, use default value %d", + token_generated_cb_step_); + } + } } template @@ -1014,7 +1029,7 @@ void Llama::forward(std::unordered_map* output_ten if (*generation_should_stop_) { break; } - if (token_generated_cb_ && step + 1 < (int)max_output_seq_len) { + if (token_generated_cb_ && (step + 1) % token_generated_cb_step_ == 0 && step + 1 < (int)max_output_seq_len) { setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h index 8ba6a88df..587a6f72a 100644 --- a/src/fastertransformer/models/llama/Llama.h +++ b/src/fastertransformer/models/llama/Llama.h @@ -129,6 +129,9 @@ class Llama: public BaseLayer { callback_sig* token_generated_cb_ = nullptr; void* token_generated_ctx_ = nullptr; + // callback step + size_t token_generated_cb_step_ = 50; // default 50, override by env LLAMA_STREAM_CB_STEP + void setOutputTensors(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, const size_t max_input_length, From 95afed4a569e37942f095fb7bf6659e7641167db Mon Sep 17 00:00:00 2001 From: void-main Date: Sat, 29 Apr 2023 23:25:40 +0000 Subject: [PATCH 13/21] add bf16 --- src/fastertransformer/models/llama/Llama.cc | 3 +++ .../models/llama/LlamaContextDecoder.cc | 3 +++ .../models/llama/LlamaDecoder.cc | 3 +++ .../models/llama/LlamaDecoderLayerWeight.cc | 3 +++ .../models/llama/LlamaWeight.cc | 3 +++ .../triton_backend/llama/LlamaTritonModel.cc | 18 +++++++++++++++++- .../llama/LlamaTritonModelInstance.cc | 4 +++- 7 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index f078861e2..2c1c2db5f 100644 --- 
a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -1220,5 +1220,8 @@ bool* Llama::getFinishBuffer() template class Llama; template class Llama; +#ifdef ENABLE_BF16 +template class Llama<__nv_bfloat16>; +#endif } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index 8ea1494fb..e3afd1780 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -499,5 +499,8 @@ void LlamaContextDecoder::forward(std::unordered_map* template class LlamaContextDecoder; template class LlamaContextDecoder; +#ifdef ENABLE_BF16 +template class LlamaContextDecoder<__nv_bfloat16>; +#endif } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index 23e9b0eec..a5cffa731 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -376,5 +376,8 @@ void LlamaDecoder::forward(std::unordered_map* template class LlamaDecoder; template class LlamaDecoder; +#ifdef ENABLE_BF16 +template class LlamaDecoder<__nv_bfloat16>; +#endif } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index 39a1d5c1f..3e97b67d0 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -221,5 +221,8 @@ void LlamaDecoderLayerWeight::mallocWeights() template struct LlamaDecoderLayerWeight; template struct LlamaDecoderLayerWeight; +#ifdef ENABLE_BF16 +template class LlamaDecoderLayerWeight<__nv_bfloat16>; +#endif } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc index 65c2f762d..6105267ff 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.cc +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -297,5 +297,8 @@ bool LlamaWeight::isValidLayerParallelId(int l) template struct LlamaWeight; template struct LlamaWeight; +#ifdef ENABLE_BF16 +template class LlamaWeight<__nv_bfloat16>; +#endif } // namespace fastertransformer diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc index a7f336843..6816e34e6 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc @@ -41,6 +41,15 @@ std::shared_ptr AbstractTransformerModel::createLlamaM reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), model_dir); } +#ifdef ENABLE_BF16 + else if (data_type == "bf16") { + return std::make_shared>( + reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + model_dir); + } +#endif else { return std::make_shared>( reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), @@ -127,6 +136,11 @@ std::unique_ptr LlamaTritonModel::createMod if (std::is_same::value) { cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); } +#ifdef ENABLE_BF16 + else if (std::is_same::value) { + 
cublas_wrapper->setBF16GemmConfig(); + } +#endif else if (std::is_same::value) { cublas_wrapper->setFP32GemmConfig(); } @@ -240,4 +254,6 @@ int LlamaTritonModel::getPipelineParaSize() template struct LlamaTritonModel; template struct LlamaTritonModel; - +#ifdef ENABLE_BF16 +template class LlamaTritonModel<__nv_bfloat16>; +#endif diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc index 593fc6c97..e46adf87d 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc @@ -259,4 +259,6 @@ void LlamaTritonModelInstance::freeBuffer() template struct LlamaTritonModelInstance; template struct LlamaTritonModelInstance; - +#ifdef ENABLE_BF16 +template class LlamaTritonModelInstance<__nv_bfloat16>; +#endif From 9aee02eb6e54e172d34ca7cac0c908cd778832a2 Mon Sep 17 00:00:00 2001 From: void-main Date: Sat, 29 Apr 2023 23:52:01 +0000 Subject: [PATCH 14/21] make sure examples work for bf16 --- examples/cpp/llama/llama_example.cc | 10 ++++++++++ src/fastertransformer/models/llama/Llama.cc | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 0105cb6aa..e80f59b1f 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -60,6 +60,11 @@ int main(int argc, char* argv[]) else if (data_type == "fp16") { llama_example(reader); } +#ifdef ENABLE_BF16 + else if (data_type == "bf16") { + llama_example<__nv_bfloat16>(reader); + } +#endif else { FT_LOG_ERROR("is_fp16 should be 0 (use float) or 1 (use half)."); return -1; @@ -259,6 +264,11 @@ void llama_example(const INIReader reader) if (std::is_same::value) { cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); } +#ifdef ENABLE_BF16 + else if (std::is_same::value) { + cublas_wrapper.setBF16GemmConfig(); + } +#endif else if (std::is_same::value) { cublas_wrapper.setFP32GemmConfig(); } diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 2c1c2db5f..2fe508803 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -269,7 +269,11 @@ Llama::Llama(size_t head_num, pipeline_para_.rank_ = 0; int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); - if (std::is_same::value) { + if (std::is_same::value +#ifdef ENABLE_BF16 + || std::is_same<__nv_bfloat16, T>::value +#endif + ) { local_vacab_size = ceil(local_vacab_size / 8.f) * 8; } vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; From 8ddac81e2d278f0451d7021c130a7f2c301612e3 Mon Sep 17 00:00:00 2001 From: void-main Date: Sun, 30 Apr 2023 02:54:29 +0000 Subject: [PATCH 15/21] support bf16 conversion with bfloat 16 numpy ext --- examples/cpp/llama/huggingface_llama_convert.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/huggingface_llama_convert.py b/examples/cpp/llama/huggingface_llama_convert.py index 37868d0bd..f45c8b99b 100644 --- a/examples/cpp/llama/huggingface_llama_convert.py +++ b/examples/cpp/llama/huggingface_llama_convert.py @@ -20,11 +20,17 @@ import os from transformers import LlamaForCausalLM +# using numpy extension: https://github.com/GreenWaves-Technologies/bfloat16 +# install the library with `pip install bfloat16` +from bfloat16 import bfloat16 + def 
get_weight_data_type(data_type): if data_type == "fp32": return np.float32 elif data_type == "fp16": return np.float16 + elif data_type == "bf16": + return bfloat16 else: assert False, f"Invalid weight data type {data_type}" @@ -175,7 +181,7 @@ def split_and_convert(args): parser.add_argument('-in_file', '-i', type=str, help='file name of input checkpoint file', required=True) parser.add_argument('-trained_gpu_num', '-t_g', type=int, help='How many gpus for inference', default=1) parser.add_argument('-infer_gpu_num', '-i_g', type=int, help='How many gpus for inference', required=True) - parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"]) + parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16", "bf16"]) parser.add_argument('-model_name', '-m_n', type=str, help='model name', required=True) args = parser.parse_args() From 40fbe484dcdbc209ff701eb955776152c06847d5 Mon Sep 17 00:00:00 2001 From: void-main Date: Mon, 1 May 2023 11:05:17 +0000 Subject: [PATCH 16/21] bugfix --- .../pytorch/gpt/utils/megatron_ckpt_convert_llama.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py index 02b4cf12a..e15c25e27 100644 --- a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py +++ b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py @@ -206,7 +206,6 @@ def preprocess_h_to_4h(val, training_tp_size, np_weight_data_type): def split_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, model_training_args, transformer_model_list, ckpt_ver, np_weight_data_type): val = safe_transpose(transformer_model_list[0][key]) val = torch2np(val, np_weight_data_type) - print(f"key: {key}, shape: {val.shape}") if key.find("layers.") != -1: layer_index = (int)(key[7 : key.find(".", 7)]) saved_key = key.replace( @@ -235,25 +234,24 @@ def split_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, mod or key.find("mlp.dense_4h_to_h.weight") != -1): split_vals = np.split(val, factor, axis=0) for j in range(factor): - print(f'saving {saved_key}, shape: {split_vals[j].shape}') saved_path = saved_dir / f"model.{saved_key}.{i * factor + j:d}.bin" split_vals[j].tofile(saved_path.as_posix()) elif (key.find("mlp.dense_h_to_4h.weight") != -1): + val = preprocess_h_to_4h(transformer_model_list[0][key], + model_training_args.tensor_model_parallel_size, + np_weight_data_type) gate_weight, up_weight = np.split(val, 2, axis=-1) - print(f'dense_h_to_4h shape: {val.shape}, gate shape: {gate_weight.shape}, up shape: {up_weight.shape}') split_gate_weight = np.split(gate_weight, factor, axis=-1) proj_key = saved_key.replace('mlp.dense_h_to_4h.weight','mlp.gate_proj.weight') for j in range(factor): - print(f'saving {proj_key}, shape: {split_gate_weight[j].shape}') saved_path = saved_dir / f"model.{proj_key}.{i * factor + j:d}.bin" split_gate_weight[j].tofile(saved_path.as_posix()) split_up_weight = np.split(up_weight, factor, axis=-1) proj_key = saved_key.replace('mlp.dense_h_to_4h.weight','mlp.up_proj.weight') for j in range(factor): - print(f'saving {proj_key}, shape: {split_up_weight[j].shape}') saved_path = saved_dir / f"model.{proj_key}.{i * factor + j:d}.bin" split_up_weight[j].tofile(saved_path.as_posix()) @@ -274,7 +272,6 @@ def split_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, mod split_vals = np.split(val, factor, axis=-1) for j in range(factor): - 
print(f'saving {saved_key}, shape: {split_vals[j].shape}') saved_path = saved_dir / f"model.{saved_key}.{i * factor + j:d}.bin" split_vals[j].tofile(saved_path.as_posix()) From f6cf9da598e01296d055be95113f25d79a93b02e Mon Sep 17 00:00:00 2001 From: void-main Date: Mon, 1 May 2023 23:18:25 +0000 Subject: [PATCH 17/21] load layernorm_eps from config; change cb default to 5 --- examples/cpp/llama/huggingface_llama_convert.py | 1 + examples/cpp/llama/llama_example.cc | 2 ++ src/fastertransformer/models/llama/Llama.cc | 5 +++++ src/fastertransformer/models/llama/Llama.h | 6 ++++-- .../triton_backend/llama/LlamaTritonModel.cc | 6 ++++-- .../triton_backend/llama/LlamaTritonModel.h | 1 + 6 files changed, 17 insertions(+), 4 deletions(-) diff --git a/examples/cpp/llama/huggingface_llama_convert.py b/examples/cpp/llama/huggingface_llama_convert.py index 37868d0bd..92841a9c0 100644 --- a/examples/cpp/llama/huggingface_llama_convert.py +++ b/examples/cpp/llama/huggingface_llama_convert.py @@ -92,6 +92,7 @@ def split_and_convert(args): config['llama']["inter_size"] = str(hf_config["intermediate_size"]) config['llama']["num_layer"] = str(num_layers) config['llama']["rotary_embedding"] = str(head_size) + config['llama']['layernorm_eps'] = str(hf_config["rms_norm_eps"]) config['llama']["vocab_size"] = str(hf_config["vocab_size"]) config['llama']["start_id"] = str(hf_config["bos_token_id"]) config['llama']["end_id"] = str(hf_config["eos_token_id"]) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index e80f59b1f..a64c86bca 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -87,6 +87,7 @@ void llama_example(const INIReader reader) const size_t vocab_size = reader.GetInteger(model_name, "vocab_size"); const size_t decoder_layers = reader.GetInteger(model_name, "num_layer"); const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); + const float layernorm_eps = reader.GetFloat(model_name, "layernorm_eps"); const int start_id = reader.GetInteger(model_name, "start_id"); const int end_id = reader.GetInteger(model_name, "end_id"); @@ -310,6 +311,7 @@ void llama_example(const INIReader reader) decoder_layers, vocab_size, rotary_embedding_dim, + layernorm_eps, start_id, end_id, prompt_learning_start_id, diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 2fe508803..37ee21b85 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -227,6 +227,7 @@ Llama::Llama(size_t head_num, size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, + float layernorm_eps, int start_id, int end_id, int prompt_learning_start_id, // only needed by p/prompt-tuning @@ -254,6 +255,7 @@ Llama::Llama(size_t head_num, num_layer_(num_layer), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), + layernorm_eps_(layernorm_eps), start_id_(start_id), end_id_(end_id), prompt_learning_start_id_(prompt_learning_start_id), @@ -287,6 +289,7 @@ Llama::Llama(size_t head_num, size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, + float layernorm_eps, int start_id, int end_id, int prompt_learning_start_id, // only needed by p/prompt-tuning @@ -316,6 +319,7 @@ Llama::Llama(size_t head_num, num_layer_(num_layer), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), + layernorm_eps_(layernorm_eps), start_id_(start_id), end_id_(end_id), 
prompt_learning_start_id_(prompt_learning_start_id), @@ -346,6 +350,7 @@ Llama::Llama(Llama const& gpt): num_layer_(gpt.num_layer_), vocab_size_(gpt.vocab_size_), rotary_embedding_dim_(gpt.rotary_embedding_dim_), + layernorm_eps_(gpt.layernorm_eps_), start_id_(gpt.start_id_), end_id_(gpt.end_id_), prompt_learning_start_id_(gpt.prompt_learning_start_id_), diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h index 587a6f72a..f318e6947 100644 --- a/src/fastertransformer/models/llama/Llama.h +++ b/src/fastertransformer/models/llama/Llama.h @@ -38,9 +38,9 @@ class Llama: public BaseLayer { size_t num_layer_; size_t vocab_size_; size_t rotary_embedding_dim_; + float layernorm_eps_; static constexpr bool neox_rotary_style_ = true; - static constexpr float layernorm_eps_ = 1e-6f; int start_id_; int end_id_; @@ -130,7 +130,7 @@ class Llama: public BaseLayer { void* token_generated_ctx_ = nullptr; // callback step - size_t token_generated_cb_step_ = 50; // default 50, override by env LLAMA_STREAM_CB_STEP + size_t token_generated_cb_step_ = 5; // default 5, override by env LLAMA_STREAM_CB_STEP void setOutputTensors(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, @@ -146,6 +146,7 @@ class Llama: public BaseLayer { size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, + float layernorm_eps, int start_id, int end_id, int prompt_learning_start_id, // only needed by p/prompt-tuning @@ -173,6 +174,7 @@ class Llama: public BaseLayer { size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, + float layernorm_eps, int start_id, int end_id, int prompt_learning_start_id, // only needed by p/prompt-tuning diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc index 6816e34e6..ea34983e4 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc @@ -84,6 +84,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, num_layer_ = reader.GetInteger("llama", "num_layer"); vocab_size_ = reader.GetInteger("llama", "vocab_size"); rotary_embedding_dim_ = reader.GetInteger("llama", "rotary_embedding"); + layernorm_eps_ = reader.GetFloat("llama", "layernorm_eps"); start_id_ = reader.GetInteger("llama", "start_id"); end_id_ = reader.GetInteger("llama", "end_id"); use_gptj_residual_ = false; @@ -162,6 +163,7 @@ std::unique_ptr LlamaTritonModel::createMod num_layer_, vocab_size_, rotary_embedding_dim_, + layernorm_eps_, start_id_, end_id_, prompt_learning_start_id_, // p/prompt tuning virtual token start id @@ -223,8 +225,8 @@ std::string LlamaTritonModel::toString() std::stringstream ss; ss << "Model: " << "\nhead_num: " << head_num_ << "\nsize_per_head: " << size_per_head_ << "\ninter_size: " << inter_size_ - << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_ << "\nstart_id: " << start_id_ - << "\nend_id: " << end_id_ << "\nuse_gptj_residual: " << use_gptj_residual_ + << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_ << "\nlayernorm_eps: " << layernorm_eps_ + << "\nstart_id: " << start_id_ << "\nend_id: " << end_id_ << "\nuse_gptj_residual: " << use_gptj_residual_ << "\nprompt_learning_type_: " << static_cast(prompt_learning_type_) << "\nprompt_learning_start_id_: " << prompt_learning_start_id_ << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << 
"\nenable_custom_all_reduce: " << enable_custom_all_reduce_ diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h index 0775ed05f..da5a277cd 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h @@ -57,6 +57,7 @@ struct LlamaTritonModel: public AbstractTransformerModel { size_t num_layer_; size_t vocab_size_; size_t rotary_embedding_dim_; + float layernorm_eps_; int start_id_; int end_id_; size_t tensor_para_size_; From da2ad14e33c46b8ec7e5c20e2600c0bf003a2e9e Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 2 May 2023 02:10:02 +0000 Subject: [PATCH 18/21] update megatron convert script --- .../gpt/utils/megatron_ckpt_convert_llama.py | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py index e15c25e27..6ebec96d3 100644 --- a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py +++ b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py @@ -186,23 +186,6 @@ def merge_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, mod else: print(f"[ERROR] cannot find key '{key}'") -def preprocess_h_to_4h(val, training_tp_size, np_weight_data_type): - """ - megatron saved format: [TP_train, 2, inter_size / 2 / TP, hidden_size] - FT needed format: [hidden_size, inter_size], [hidden_size, inter_size] - This function translates megatron weight to FT weight - """ - val_shape = val.shape - val = val.view(training_tp_size, - 2, - val_shape[0] // (2 * training_tp_size), - val_shape[1]) - val = val.transpose(0, 1).contiguous() - val = val.view(*val_shape) - - val = safe_transpose(val) - return torch2np(val, np_weight_data_type) - def split_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, model_training_args, transformer_model_list, ckpt_ver, np_weight_data_type): val = safe_transpose(transformer_model_list[0][key]) val = torch2np(val, np_weight_data_type) @@ -238,9 +221,6 @@ def split_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, mod split_vals[j].tofile(saved_path.as_posix()) elif (key.find("mlp.dense_h_to_4h.weight") != -1): - val = preprocess_h_to_4h(transformer_model_list[0][key], - model_training_args.tensor_model_parallel_size, - np_weight_data_type) gate_weight, up_weight = np.split(val, 2, axis=-1) split_gate_weight = np.split(gate_weight, factor, axis=-1) @@ -454,6 +434,7 @@ def convert_checkpoint(args): config["llama"]["num_layer"] = str(model_training_args.num_layers) config["llama"]["max_pos_seq_len"] = str(model_training_args.max_position_embeddings) config["llama"]["vocab_size"] = str(model_training_args.padded_vocab_size) + config["llama"]["layernorm_eps"] = args.layernorm_eps config["llama"]["start_id"] = '0' config["llama"]["end_id"] = '1' config["llama"]["weight_data_type"] = args.weight_data_type @@ -499,6 +480,9 @@ def main(): parser.add_argument( "--weight-data-type", "-weight_data_type", choices=["fp32", "fp16"], default="fp32", help="" ) + parser.add_argument( + "--layernorm-eps", default="1e-05", type=str, help="rms layernorm eps", required=True + ) parser.add_argument( "--load-checkpoints-to-cpu", "-load_checkpoints_to_cpu", From abd1e4dd9e0feb853fae064ced9666789e109ade Mon Sep 17 00:00:00 2001 From: void-main Date: Sat, 6 May 2023 07:56:48 +0000 Subject: [PATCH 19/21] fix callback issue --- 
src/fastertransformer/models/llama/Llama.cc | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 37ee21b85..8e3f1cedf 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -1114,19 +1114,18 @@ void Llama::setOutputTensors(std::unordered_map* o const size_t batch_size = output_tensors->at("output_ids").shape[0]; const size_t beam_width = output_tensors->at("output_ids").shape[1]; - uint* sequence_lengths = output_tensors->at("sequence_length").getPtr(); + int* sequence_lengths = output_tensors->at("sequence_length").getPtr(); const size_t max_prefix_soft_prompt_length = has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; + cudaAutoCpy(sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); if (input_tensors->at("input_ids").shape[1] == 0) { - invokeCudaD2DcpyConvert( - sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); // TODO: D2D sequence_lenghts if (beam_width > 1) { // For beam search, do gather_tree // take output_parent_ids as inter buffer invokeGatherTree(transposed_output_ids_buf_, - sequence_lengths_, + sequence_lengths, max_output_seq_len, batch_size, beam_width, @@ -1158,7 +1157,7 @@ void Llama::setOutputTensors(std::unordered_map* o // For sampling, it is equivalent to all parent ids are 0. gatherTreeParam param; param.beams = transposed_output_ids_buf_; - param.max_sequence_lengths = sequence_lengths_; + param.max_sequence_lengths = sequence_lengths; // add sequence_length 1 here because the sequence_length of time step t is t - 1 param.max_sequence_length_final_step = 1; param.max_time = max_output_seq_len; @@ -1176,8 +1175,6 @@ void Llama::setOutputTensors(std::unordered_map* o param.stream = stream_; param.output_ids = output_tensors->at("output_ids").getPtr(); invokeGatherTree(param); - invokeCudaD2DcpyConvert( - sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); sync_check_cuda_error(); } if ((output_tensors->count("output_log_probs") > 0 && output_tensors->at("output_log_probs").data != nullptr)) { From 50fdb0c603f7740a80f93501d7dc29bda131552a Mon Sep 17 00:00:00 2001 From: void-main Date: Sun, 11 Jun 2023 06:48:57 +0000 Subject: [PATCH 20/21] fix name --- examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py index 6ebec96d3..99f573bb9 100644 --- a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py +++ b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py @@ -386,7 +386,7 @@ def convert_checkpoint(args): )) if j == training_pipeline_para_size - 1: w_e_head_list.append(torch2np( - m["model"]["word_embeddings_for_head"]["weight"], + m["model"]["language_model"]["output_layer"]["weight"], np_weight_data_type )) transformer_models.append(m["model"]["language_model"][megatron_gpt_key]) From dce9b65620bd8030386a2de027a9013c6c2cddb0 Mon Sep 17 00:00:00 2001 From: Binxuan Huang Date: Thu, 27 Jul 2023 17:18:12 -0700 Subject: [PATCH 21/21] add fusion-for-decoder-only for llama --- CMakeLists.txt | 6 +- examples/cpp/llama/CMakeLists.txt | 10 +- examples/cpp/llama/check_with_huggingface.py | 16 - .../cpp/llama/huggingface_llama_convert.py | 2 +- 
examples/cpp/llama/llama_config.ini | 2 +- examples/cpp/llama/llama_decoder.py | 1253 +++++++++++++++++ ...{llama_example.cc => llama_fid_example.cc} | 15 +- ...example.cc => llama_fid_triton_example.cc} | 2 +- examples/cpp/llama/start_ids.csv | 2 +- .../decoder_masked_multihead_attention.h | 6 +- ...er_masked_multihead_attention_template.hpp | 10 +- .../DecoderSelfAttentionLayer.cc | 6 +- .../DecoderSelfAttentionLayer.h | 1 + .../models/llama/CMakeLists.txt | 9 +- .../models/llama/LlamaDecoder.cc | 4 +- .../models/llama/{Llama.cc => LlamaFiD.cc} | 132 +- .../models/llama/{Llama.h => LlamaFiD.h} | 5 +- .../triton_backend/llama/CMakeLists.txt | 4 +- ...aTritonModel.cc => LlamaFiDTritonModel.cc} | 2 +- ...amaTritonModel.h => LlamaFiDTritonModel.h} | 2 +- .../llama/LlamaTritonModelInstance.cc | 8 +- .../llama/LlamaTritonModelInstance.h | 4 +- src/fastertransformer/utils/memory_utils.cu | 51 + src/fastertransformer/utils/memory_utils.h | 3 + 24 files changed, 1462 insertions(+), 93 deletions(-) delete mode 100644 examples/cpp/llama/check_with_huggingface.py create mode 100644 examples/cpp/llama/llama_decoder.py rename examples/cpp/llama/{llama_example.cc => llama_fid_example.cc} (97%) rename examples/cpp/llama/{llama_triton_example.cc => llama_fid_triton_example.cc} (99%) rename src/fastertransformer/models/llama/{Llama.cc => LlamaFiD.cc} (92%) rename src/fastertransformer/models/llama/{Llama.h => LlamaFiD.h} (98%) rename src/fastertransformer/triton_backend/llama/{LlamaTritonModel.cc => LlamaFiDTritonModel.cc} (99%) rename src/fastertransformer/triton_backend/llama/{LlamaTritonModel.h => LlamaFiDTritonModel.h} (98%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5eb40abe8..96b4f1818 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,7 +101,7 @@ if(USE_TRITONSERVER_DATATYPE) add_definitions("-DUSE_TRITONSERVER_DATATYPE") endif() -set(CXX_STD "14" CACHE STRING "C++ standard") +set(CXX_STD "17" CACHE STRING "C++ standard") set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}) @@ -238,7 +238,7 @@ if(BUILD_TF2) add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1) endif() -set(PYTHON_PATH "python" CACHE STRING "Python path") +set(PYTHON_PATH "python3" CACHE STRING "Python path") if(BUILD_PYT) execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch; print(torch.__version__,end='');" RESULT_VARIABLE _PYTHON_SUCCESS @@ -348,7 +348,7 @@ add_library(transformer-shared SHARED $ $ $ - $ + $ $ $ $ diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt index cdf9033dd..33d04bef6 100644 --- a/examples/cpp/llama/CMakeLists.txt +++ b/examples/cpp/llama/CMakeLists.txt @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-add_executable(llama_example llama_example.cc) -target_link_libraries(llama_example PUBLIC -lcublas -lcublasLt -lcudart - Llama nvtx_utils gpt_example_utils word_list mpi_utils nccl_utils) +add_executable(llama_fid_example llama_fid_example.cc) +target_link_libraries(llama_fid_example PUBLIC -lcublas -lcublasLt -lcudart + LlamaFiD nvtx_utils gpt_example_utils word_list mpi_utils nccl_utils) -add_executable(llama_triton_example llama_triton_example.cc) -target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart -lpthread +add_executable(llama_fid_triton_example llama_fid_triton_example.cc) +target_link_libraries(llama_fid_triton_example PUBLIC -lcublas -lcublasLt -lcudart -lpthread LlamaTritonBackend TransformerTritonBackend custom_ar_comm gpt_example_utils word_list mpi_utils nccl_utils nvtx_utils) diff --git a/examples/cpp/llama/check_with_huggingface.py b/examples/cpp/llama/check_with_huggingface.py deleted file mode 100644 index d1f356cc1..000000000 --- a/examples/cpp/llama/check_with_huggingface.py +++ /dev/null @@ -1,16 +0,0 @@ -import transformers - -from transformers import LlamaForCausalLM, LlamaTokenizer - -tokenizer = LlamaTokenizer.from_pretrained('/data/llama-7b-hf') - -prompt = "Hey, are you consciours? Can you talk to me?" -inputs = tokenizer(prompt, return_tensors='pt') -model = LlamaForCausalLM.from_pretrained("/data/llama-7b-hf") -hf_config = vars(model.config) -print(hf_config) -generated_ids = model.forward(inputs.input_ids, output_hidden_states=True) -print(generated_ids) - -tokens = [0,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366] -print(tokenizer.decode(tokens)) diff --git a/examples/cpp/llama/huggingface_llama_convert.py b/examples/cpp/llama/huggingface_llama_convert.py index 360795080..c70bc2e13 100644 --- a/examples/cpp/llama/huggingface_llama_convert.py +++ b/examples/cpp/llama/huggingface_llama_convert.py @@ -191,4 +191,4 @@ def split_and_convert(args): print("{}: {}".format(key, vars(args)[key])) print("========================================") - split_and_convert(args) + split_and_convert(args) \ No newline at end of file diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 6fa7117ca..35bf1e6d2 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -29,4 +29,4 @@ rotary_embedding = 128 vocab_size = 32000 start_id = 0 end_id = 1 -weight_data_type = fp16 +weight_data_type = fp16 \ No newline at end of file diff --git a/examples/cpp/llama/llama_decoder.py b/examples/cpp/llama/llama_decoder.py new file mode 100644 index 000000000..a99055a32 --- /dev/null +++ b/examples/cpp/llama/llama_decoder.py @@ -0,0 +1,1253 @@ +''' +This piece of Llama model code is copied from https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py +We changed the causal attention computation to pytorch's torch.nn.functional.scaled_dot_product_attention, +which has native flash attention support. After making the change, we observed 50% speed improvement on +a 7B lamma model. +Lines changed: line 322 - 348, line 646 - 648 +We besically discarded the causal attention mask creation and only passed is_causal parameter to the +scaled_dot_product_attention function. +''' + + +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. 
team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch LLaMA model.""" +import math +from typing import List, Optional, Tuple, Union +from torch.nn import functional as F + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings + +from transformers.configuration_utils import PretrainedConfig + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer + +#logger = logging.get_logger(__name__) + +LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class LlamaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the LLaMA-7B. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Example: + ```python + >>> from transformers import LlamaModel, LlamaConfig + >>> # Initializing a LLaMA llama-7b style configuration + >>> configuration = LlamaConfig() + >>> # Initializing a model from the llama-7b style configuration + >>> model = LlamaModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "llama" + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "LlamaConfig" + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
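+ Positions where the mask is 0 are filled with the dtype's minimum value, so they receive ~zero attention weight after softmax; positions where the mask is 1 are left at 0.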
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +class LlamaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + ''' + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + return (self.weight * hidden_states).to(input_dtype) + ''' + def forward(self, hidden_states): + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + + +class LlamaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) + self.register_buffer("inv_freq", inv_freq) + + # Build here to make `torch.jit.trace` work. + self.max_seq_len_cached = max_position_embeddings + t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. + if seq_len > self.max_seq_len_cached: + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1).to(x.device) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) + return ( + self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
+ rot_dim = cos.shape[-1] + q, q_pass = q[...,:rot_dim], q[..., rot_dim:] + k, k_pass = k[...,:rot_dim], k[..., rot_dim:] + + cos = cos.squeeze((0, 1)) # [seq_len, dim] + sin = sin.squeeze((0, 1)) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return torch.cat((q_embed, q_pass), dim=-1), torch.cat((k_embed, k_pass), dim=-1) + + +class LlamaMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + ): + super().__init__() + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.act_fn = ACT2FN[hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +class LlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: LlamaConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = config.max_position_embeddings + self.dropout_rate = config.dropout if hasattr(config, 'dropout') else 0.0 + self.resid_dropout = nn.Dropout(self.dropout_rate) + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + if hasattr(self.config, 'rotary_percentage'): + rotary_percentage = config.rotary_percentage + else: + rotary_percentage = 1 + self.rotary_emb = LlamaRotaryEmbedding(int(self.head_dim * rotary_percentage), max_position_embeddings=self.max_position_embeddings) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] 
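A small usage sketch of the rotary helpers above (illustrative only; assumes PyTorch 2.x for the tuple form of `squeeze`): position 0 is left unchanged because cos is 1 and sin is 0 there.

```python
import torch

bs, heads, seq, head_dim = 1, 2, 4, 8
q = torch.randn(bs, heads, seq, head_dim)
k = torch.randn(bs, heads, seq, head_dim)
rope = LlamaRotaryEmbedding(head_dim)
cos, sin = rope(q, seq_len=seq)                     # each [1, 1, seq, head_dim]
position_ids = torch.arange(seq).unsqueeze(0)       # [1, seq]
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
assert q_rot.shape == q.shape and k_rot.shape == k.shape
assert torch.allclose(q_rot[:, :, 0], q[:, :, 0], atol=1e-6)   # position 0 unrotated
```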
+ + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=self.dropout_rate if self.training else 0.0, is_causal=False) + + # attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + # if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + # raise ValueError( + # f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" + # f" {attn_weights.size()}" + # ) + + # if attention_mask is not None: + # if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + # raise ValueError( + # f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + # ) + # attn_weights = attn_weights + attention_mask + # attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) + + # # upcast attention to fp32 + # attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + # attn_output = torch.matmul(attn_weights, value_states) + + # if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + # raise ValueError( + # f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + # f" {attn_output.size()}" + # ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.resid_dropout(self.o_proj(attn_output)) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class LlamaDecoderLayer(nn.Module): + def __init__(self, config: LlamaConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = LlamaAttention(config=config) + self.mlp = LlamaMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + ) + self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
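To see the cache behaviour of `LlamaAttention` above, here is an illustrative sketch (not part of the patch; a toy config, no attention mask, shapes only): the returned `past_key_value` grows by one time step per decoded token.

```python
import torch

cfg = LlamaConfig(hidden_size=64, num_attention_heads=4, intermediate_size=128)
attn = LlamaAttention(cfg)
x = torch.randn(1, 5, 64)                           # 5-token prompt
out, _, past = attn(x, position_ids=torch.arange(5).unsqueeze(0), use_cache=True)
print(past[0].shape)                                # torch.Size([1, 4, 5, 16])  cached keys
x_new = torch.randn(1, 1, 64)                       # one generated token
out2, _, past2 = attn(x_new, position_ids=torch.tensor([[5]]),
                      past_key_value=past, use_cache=True)
print(past2[0].shape)                               # torch.Size([1, 4, 6, 16])  cache grew by one step
```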
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +LLAMA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`LlamaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class LlamaPreTrainedModel(PreTrainedModel): + config_class = LlamaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["LlamaDecoderLayer"] + _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, LlamaModel): + module.gradient_checkpointing = value + + +LLAMA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
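The decoder layer above is pre-norm: RMSNorm feeds each sub-layer and the residual is added afterwards, twice per block. A toy round-trip, illustrative only:

```python
import torch

cfg = LlamaConfig(hidden_size=64, num_attention_heads=4, intermediate_size=128)
layer = LlamaDecoderLayer(cfg)
x = torch.randn(2, 7, 64)
pos = torch.arange(7).unsqueeze(0).expand(2, -1)
(y,) = layer(x, position_ids=pos)                   # norm -> attention -> +x, norm -> MLP -> +x
print(y.shape)                                      # torch.Size([2, 7, 64])
```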
+ + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class LlamaModel(LlamaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`LlamaDecoderLayer`] + + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + 
position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class LMHead(nn.Linear): + def __init__(self, in_features, out_features, bias=True, device=None, dtype=None): + super().__init__(in_features, out_features, bias) + + def forward(self, input): + return super().forward(input.float()) + +class LlamaForCausalLM(LlamaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.model = LlamaModel(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + #self.lm_head = LMHead(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + 
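A minimal end-to-end sketch of `LlamaModel` (illustrative only, tiny random weights): a prompt pass with `use_cache=True`, then a single-token step that reuses the cache.

```python
import torch

cfg = LlamaConfig(vocab_size=128, hidden_size=64, intermediate_size=128,
                  num_hidden_layers=2, num_attention_heads=4)
model = LlamaModel(cfg)
ids = torch.randint(1, 128, (1, 6))
out = model(input_ids=ids, use_cache=True)
print(out.last_hidden_state.shape)                  # torch.Size([1, 6, 64])
print(out.past_key_values[0][0].shape)              # torch.Size([1, 4, 6, 16])
step = model(input_ids=ids[:, -1:], past_key_values=out.past_key_values, use_cache=True)
print(step.past_key_values[0][0].shape)             # torch.Size([1, 4, 7, 16])
```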
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, LlamaForCausalLM + + >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you consciours? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # make generation compatible with transformers 4.23.1 and use_cache=True + if past_key_values is None: + past_key_values = kwargs.get('past', None) + + if 
past_key_values: + input_ids = input_ids[:, -1:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings( + """ + The LLaMa Model transformer with a sequence classification head on top (linear layer). + + [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + LLAMA_START_DOCSTRING, +) +class LlamaForSequenceClassification(LlamaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = LlamaModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
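`prepare_inputs_for_generation` above is what makes incremental decoding work: once a cache exists, only the newest token is fed back in and its position is recovered from the attention mask. An illustrative sketch with a tiny random model:

```python
import torch

model = LlamaForCausalLM(LlamaConfig(vocab_size=128, hidden_size=64, intermediate_size=128,
                                     num_hidden_layers=1, num_attention_heads=4))
ids = torch.randint(1, 128, (1, 5))
out = model(input_ids=ids, attention_mask=torch.ones(1, 5, dtype=torch.long), use_cache=True)
inputs = model.prepare_inputs_for_generation(
    torch.cat([ids, torch.tensor([[3]])], dim=-1),      # prompt plus one sampled token
    past_key_values=out.past_key_values,
    attention_mask=torch.ones(1, 6, dtype=torch.long),
)
print(inputs["input_ids"])                              # only the new token, shape [1, 1]
print(inputs["position_ids"])                           # tensor([[5]])
```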
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + + + + +VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model", + }, + "tokenizer_file": { + "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json", + }, +} +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "hf-internal-testing/llama-tokenizer": 2048, +} + + +class LlamaTokenizer(PreTrainedTokenizer): + """ + Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. + + Args: + vocab_file (`str`): + Path to the vocabulary file. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token=None, + sp_model_kwargs: Optional[Dict[str, Any]] = None, + add_bos_token=True, + add_eos_token=False, + clean_up_tokenization_spaces=False, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + sp_model_kwargs=self.sp_model_kwargs, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + @property + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Returns a tokenized string.""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for i, token in enumerate(tokens): + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special and i != 0: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string + + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = bos_token_id + token_ids_0 + eos_token_id + + if token_ids_1 is not None: + output = output + bos_token_id + token_ids_1 + eos_token_id + + return output + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + bos_token_id = [1] if self.add_bos_token else [] + eos_token_id = [1] if self.add_eos_token else [] + + if token_ids_1 is None: + return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + return ( + bos_token_id + + ([0] * len(token_ids_0)) + + eos_token_id + + bos_token_id + + ([0] * len(token_ids_1)) + + eos_token_id + ) + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + if token_ids_1 is None, only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of ids. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
+ """ + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) + + if token_ids_1 is not None: + output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) + + return output \ No newline at end of file diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_fid_example.cc similarity index 97% rename from examples/cpp/llama/llama_example.cc rename to examples/cpp/llama/llama_fid_example.cc index a64c86bca..a9780b463 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_fid_example.cc @@ -16,7 +16,7 @@ #include "3rdparty/INIReader.h" #include "examples/cpp/multi_gpu_gpt/gpt_example_utils.h" -#include "src/fastertransformer/models/llama/Llama.h" +#include "src/fastertransformer/models/llama/LlamaFiD.h" #include "src/fastertransformer/utils/mpi_utils.h" #include "src/fastertransformer/utils/nccl_utils.h" #include "src/fastertransformer/utils/nvtx_utils.h" @@ -103,7 +103,7 @@ void llama_example(const INIReader reader) const float len_penalty = reader.GetFloat("request", "len_penalty"); const float beam_search_diversity_rate = reader.GetFloat("request", "beam_search_diversity_rate"); const int min_length = reader.GetInteger("request", "min_length", 0); - const size_t request_batch_size = 1; // reader.GetInteger("request", "request_batch_size"); + const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); // The length of tokens we hope this model to generate const int request_output_len = reader.GetInteger("request", "request_output_len"); @@ -246,7 +246,7 @@ void llama_example(const INIReader reader) prefix_prompt_task_ids[i] = (num_tasks > 0) ? 
i % num_tasks : 0; } - const int total_output_len = max_input_len + request_output_len; + const int total_output_len = max_input_len * request_batch_size + request_output_len; cudaStream_t stream; cublasHandle_t cublas_handle; @@ -416,7 +416,8 @@ void llama_example(const INIReader reader) } cudaDeviceSynchronize(); mpi::barrier(); - + //output_tensors.output_ids + printDebugLayer(d_output_ids, request_batch_size * beam_width * total_output_len, "return"); POP_RANGE; ft_nvtx::resetScope(); @@ -465,9 +466,9 @@ void llama_example(const INIReader reader) ft_nvtx::setScope("total_time"); PUSH_RANGE("total time") - for (int i = 0; i < ite; ++i) { - gpt.forward(&output_tensors, &input_tensors, &gpt_weights); - } + // for (int i = 0; i < ite; ++i) { + // gpt.forward(&output_tensors, &input_tensors, &gpt_weights); + // } cudaDeviceSynchronize(); mpi::barrier(); diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_fid_triton_example.cc similarity index 99% rename from examples/cpp/llama/llama_triton_example.cc rename to examples/cpp/llama/llama_fid_triton_example.cc index e639ae9ca..dcdb5973c 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_fid_triton_example.cc @@ -16,7 +16,7 @@ #include "3rdparty/INIReader.h" #include "examples/cpp/multi_gpu_gpt/gpt_example_utils.h" -#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h" +#include "src/fastertransformer/triton_backend/llama/LlamaFiDTritonModel.h" #include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/fastertransformer/utils/custom_ar_comm.h" #include "src/fastertransformer/utils/mpi_utils.h" diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv index 35ddd915d..2bbcb6b21 100644 --- a/examples/cpp/llama/start_ids.csv +++ b/examples/cpp/llama/start_ids.csv @@ -5,4 +5,4 @@ 0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 -0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 \ No newline at end of file diff --git a/src/fastertransformer/kernels/decoder_masked_multihead_attention.h b/src/fastertransformer/kernels/decoder_masked_multihead_attention.h index 5a768184c..a3086fcd2 100644 --- a/src/fastertransformer/kernels/decoder_masked_multihead_attention.h +++ b/src/fastertransformer/kernels/decoder_masked_multihead_attention.h @@ -50,7 +50,7 @@ template struct Multihead_attention_params_base { - + int rotary_position = 0; // The output buffer. Dimensions B x D. 
T* out = nullptr; @@ -132,7 +132,7 @@ struct Multihead_attention_params: public Multihead_attention_params_base { // required in case of cross attention // will need it here till if constexpr in c++17 int* memory_length_per_sample = nullptr; - + // required in case of masked attention with different length const int* length_per_sample = nullptr; }; @@ -149,7 +149,7 @@ struct Multihead_attention_params: public Multihead_attention_params_ba // required in case of cross attention int* memory_length_per_sample = nullptr; - + // required in case of masked attention with different length const int* length_per_sample = nullptr; }; diff --git a/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp b/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp index 8e7cb92a2..1de4b4b5f 100644 --- a/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp +++ b/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp @@ -1323,10 +1323,10 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params 0 && !params.neox_rotary_style) { if (handle_kv) { - apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, params.timestep - padd_len); + apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, ((params.rotary_position != 0)? params.rotary_position: params.timestep) - padd_len); } else { - apply_rotary_embedding(q, tidx, params.rotary_embedding_dim, params.timestep - padd_len); + apply_rotary_embedding(q, tidx, params.rotary_embedding_dim, ((params.rotary_position != 0)? params.rotary_position: params.timestep) - padd_len); } } else if (params.rotary_embedding_dim > 0 && params.neox_rotary_style) { @@ -1358,16 +1358,18 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params::Type; @@ -135,6 +136,7 @@ void fusedQKV_masked_attention_dispatch(const T* qkv_buf, params.ia3_value_weights = reinterpret_cast(ia3_value_weights); params.int8_mode = int8_mode; + params.rotary_position = rotary_position; if (int8_mode == 2) { params.qkv_scale_out = qkv_scale_out; params.attention_out_scale = attention_out_scale; @@ -177,7 +179,8 @@ void fusedQKV_masked_attention_dispatch(const T* qkv_buf, const T* ia3_value_weights, \ const float* qkv_scale_out, \ const float* attention_out_scale, \ - const int int8_mode, \ + const int int8_mode, \ + const int rotary_position, \ cudaStream_t stream) INSTANTIATE_FUSEDQKV_MASKED_ATTENTION_DISPATCH(float); @@ -611,6 +614,7 @@ void DecoderSelfAttentionLayer::forward(TensorMap* output_tens int8_mode_ == 2 ? attention_weights->query_weight.scale_out : nullptr, int8_mode_ == 2 ? 
attention_weights->attention_output_weight.scale : nullptr, int8_mode_, + input_tensors->getVal("rotary_position"), stream_); sync_check_cuda_error(); diff --git a/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h b/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h index 1b1644e64..f14ffd42e 100644 --- a/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h @@ -183,6 +183,7 @@ void fusedQKV_masked_attention_dispatch(const T* qkv_buf, const float* qkv_scale_out, const float* attention_out_scale, const int int8_mode, + const int rotary_position, cudaStream_t stream); } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt index 88ad2ad42..61cd8e3ba 100644 --- a/src/fastertransformer/models/llama/CMakeLists.txt +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -52,14 +52,15 @@ set_property(TARGET LlamaWeight PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET LlamaWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LlamaWeight PUBLIC LlamaDecoderLayerWeight cuda_utils logger) -add_library(Llama STATIC Llama.cc) -set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(Llama PUBLIC -lcudart +add_library(LlamaFiD STATIC LlamaFiD.cc) +set_property(TARGET LlamaFiD PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LlamaFiD PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LlamaFiD PUBLIC -lcudart LlamaDecoder LlamaContextDecoder decoding_kernels gpt_kernels + memory_utils DynamicDecodeLayer BaseBeamSearchLayer bert_preprocess_kernels diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index a5cffa731..f1990b21b 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -208,13 +208,14 @@ void LlamaDecoder::forward(std::unordered_map* // Here, local_batch_size contains the beam_width, so local_batch_size / beam_width // is real local_batch_size. 
// masked_tokens[local_batch_size, memory_len] + // rotary_position [1] on cpu // output tensors: // decoder_output [local_batch_size, hidden_dimension], // key_cache [num_layer, batch_size, head_num, size_per_head // x, memory_len, x] // value_cache [num_layer, batch_size, head_num, memory_len, size_per_head] - FT_CHECK(input_tensors->size() == 11); + FT_CHECK(input_tensors->size() == 12); FT_CHECK(output_tensors->size() == 3); const DataType data_type = getTensorType(); @@ -289,6 +290,7 @@ void LlamaDecoder::forward(std::unordered_map* {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &gpt_decoder_layer_weight->at(l)->self_attention_weights); diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/LlamaFiD.cc similarity index 92% rename from src/fastertransformer/models/llama/Llama.cc rename to src/fastertransformer/models/llama/LlamaFiD.cc index 8e3f1cedf..5ddae88be 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/LlamaFiD.cc @@ -14,11 +14,13 @@ * limitations under the License. */ -#include "src/fastertransformer/models/llama/Llama.h" +#include "src/fastertransformer/models/llama/LlamaFiD.h" #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/decoding_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" #include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" +#include "src/fastertransformer/utils/memory_utils.h" + #include namespace fastertransformer { @@ -64,7 +66,7 @@ void Llama::initialize() dynamic_decode_layer_ = new DynamicDecodeLayer(vocab_size_, vocab_size_padded_, - 0, // end_id, deprecated + end_id_, // end_id, deprecated stream_, cublas_wrapper_, allocator_, @@ -95,12 +97,15 @@ void Llama::allocateBuffer() template void Llama::allocateBuffer( - size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) + size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len, size_t max_cache_len) { FT_LOG_DEBUG(__PRETTY_FUNCTION__); const size_t batchxbeam = batch_size * beam_width; const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batchxbeam * max_cache_seq_len * hidden_units_ / tensor_para_.world_size_; + + const size_t full_cache_size = (num_layer_ / pipeline_para_.world_size_) * beam_width * max_cache_len + * hidden_units_ / tensor_para_.world_size_; if (vocab_size_ != vocab_size_padded_) { padded_embedding_kernel_ = @@ -112,7 +117,7 @@ void Llama::allocateBuffer( } input_attention_mask_ = (T*)(allocator_->reMalloc( - input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); + input_attention_mask_, sizeof(T) * batchxbeam * max_cache_seq_len * max_cache_seq_len, false)); decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); decoder_output_buf_ = (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); @@ -127,7 +132,9 @@ void Llama::allocateBuffer( sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); key_cache_ = (T*)(allocator_->reMalloc(key_cache_, 
sizeof(T) * self_cache_size * 2, true)); + key_cache_full = (T*)(allocator_->reMalloc(key_cache_full, sizeof(T) * full_cache_size * 2, true)); value_cache_ = key_cache_ + self_cache_size; + value_cache_full = key_cache_full + full_cache_size; if (beam_width > 1) { cache_indirections_[0] = (int*)(allocator_->reMalloc(cache_indirections_[0], sizeof(int) * batchxbeam * max_seq_len * 2, true)); @@ -151,7 +158,7 @@ void Llama::allocateBuffer( output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); seq_limit_len_ = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false)); - masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * max_cache_seq_len, true)); + masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * max_cache_len, true)); start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false)); end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false)); @@ -190,6 +197,7 @@ void Llama::freeBuffer() allocator_->free((void**)(&sequence_lengths_)); allocator_->free((void**)(&key_cache_)); + allocator_->free((void**)(&key_cache_full)); if (cache_indirections_[0] != nullptr) { allocator_->free((void**)(&cache_indirections_)[0]); } @@ -220,6 +228,7 @@ void Llama::freeBuffer() } } + template Llama::Llama(size_t head_num, size_t size_per_head, @@ -533,25 +542,19 @@ void Llama::forward(std::unordered_map* output_ten const size_t max_prefix_soft_prompt_length = has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; const size_t limit_len_offset = max_prefix_soft_prompt_length + (max_input_length == 0 ? 1 : 0); + // max_input_len * request_batch_size + request_output_len; const size_t max_output_seq_len = input_tensors->at("output_seq_len").max() + limit_len_offset; + // max_input_len * request_batch_size + request_output_len; const size_t max_seq_len = max_output_seq_len; + // max cache seq len should include max prefix prompt length as it has k/v states - const size_t max_cache_seq_len = max_output_seq_len + max_prefix_prompt_length; - if (max_cache_seq_len < max_seq_len) { - FT_LOG_WARNING("max_cache_seq_len (%d) is less than max_seq_len (%d). " - "Note that this reduces the memory cost of k/v cache, but may hurt the accuracy.", - max_cache_seq_len, - max_seq_len); - } - else if (max_cache_seq_len > max_seq_len) { - FT_LOG_WARNING("max_cache_seq_len (%d) is larger than max_seq_len (%d). " - "This may lead to additional memory cost. 
Suggest to use smaller max_cache_seq_len.", - max_cache_seq_len, - max_seq_len); - } + // with N context we have to increase max_cache_seq_len by N * prefix length, TODO: reduce some wasted allocated memory + const size_t max_cache_len = max_output_seq_len; + const size_t max_cache_seq_len = max_input_length; + const cudaDataType_t gemm_data_type = getCudaDataType(); allocateBuffer( - batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length + max_prefix_soft_prompt_length); + batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length + max_prefix_soft_prompt_length, max_cache_len); setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); sync_check_cuda_error(); @@ -570,13 +573,27 @@ void Llama::forward(std::unordered_map* output_ten size_per_head_ / (16 / sizeof(T)), max_cache_seq_len, 16 / sizeof(T)}; + + const std::vector full_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, + beam_width, + local_head_num_, + size_per_head_ / (16 / sizeof(T)), + max_cache_len, + 16 / sizeof(T)}; + const std::vector self_v_cache_shape = {num_layer_ / pipeline_para_.world_size_, batch_size * beam_width, local_head_num_, max_cache_seq_len, size_per_head_}; - // initialize the output ids and parent ids + const std::vector full_v_cache_shape = {num_layer_ / pipeline_para_.world_size_, + beam_width, + local_head_num_, + max_cache_len, + size_per_head_}; + + // initialize the output ids and parent ids to zero cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); cudaMemsetAsync(parent_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); cudaMemsetAsync(masked_tokens_, false, sizeof(bool) * batch_size * beam_width * max_cache_seq_len, stream_); @@ -651,6 +668,7 @@ void Llama::forward(std::unordered_map* output_ten sync_check_cuda_error(); } + invokeBuildDecoderAttentionMask(input_attention_mask_, tiled_input_lengths_buf_, tiled_prompt_lengths_buf_, @@ -700,6 +718,7 @@ void Llama::forward(std::unordered_map* output_ten gpt_context_decoder_->forward( &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); sync_check_cuda_error(); + max_input_length = batch_size * max_input_length; invokeDecodingInitialize(finished_buf_, sequence_lengths_, nullptr, @@ -707,7 +726,7 @@ void Llama::forward(std::unordered_map* output_ten start_ids_buf_, batch_size, beam_width, - max_input_length - 1, + max_input_length,// - 1, stream_); sync_check_cuda_error(); } @@ -783,26 +802,57 @@ void Llama::forward(std::unordered_map* output_ten input_tensors->at("input_lengths").getPtr(), // not_tiled tiled_prompt_lengths_buf_, max_cache_seq_len, - max_input_length + max_prefix_prompt_length, + max_input_length/batch_size + max_prefix_prompt_length, 0, batch_size, beam_width, stream_); - for (int step = max_input_length; step < (int)max_output_seq_len; step++) { + + // transpose key_cache_ to key_cache_full + // transpose value_cache_ to value_cache_full + + invokeFlattenKV(key_cache_full, + key_cache_, + num_layer_ / pipeline_para_.world_size_, + batch_size * beam_width, + local_head_num_ * size_per_head_ / (16 / sizeof(T)), + max_cache_seq_len, + 16 / sizeof(T), + batch_size, + (max_cache_len - batch_size * max_cache_seq_len) + ); + + invokeFlattenKV(value_cache_full, + value_cache_, + num_layer_ / pipeline_para_.world_size_, + batch_size * beam_width, + local_head_num_, + max_cache_seq_len, + size_per_head_, + batch_size, + (max_cache_len - batch_size * 
max_cache_seq_len) + ); + + // set decoder start token id + cudaMemsetAsync(output_ids_buf_ + batch_size * max_input_length, start_id_, 1 , stream_); + + int rotary_position = max_input_length / batch_size; + + for (int step = max_input_length + 1; step < (int)max_output_seq_len; step++) { const int src_indir_idx = (step - max_input_length) % 2; const int tgt_indir_idx = 1 - src_indir_idx; - const size_t local_batch_size = getLocalBatchSize(batch_size, 1, pipeline_para_.world_size_); - FT_CHECK(batch_size % local_batch_size == 0); - const size_t iteration_num = batch_size / local_batch_size; - *generation_should_stop_ = true; + const size_t local_batch_size = 1; + const size_t iteration_num = 1; + *generation_should_stop_ = true; + for (uint ite = 0; ite < iteration_num; ++ite) { const int id_offset = ite * local_batch_size * beam_width; const int hidden_units_offset = id_offset * hidden_units_; const int vocab_size_units_offset = id_offset * vocab_size_padded_; - + if (!(max_input_length > 1 && step == max_input_length)) { if (pipeline_para_.rank_ == 0) { invokeEmbeddingLookupPosEncodingPadCount(decoder_input_buf_ + hidden_units_offset, @@ -819,6 +869,7 @@ void Llama::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); } + std::unordered_map decoder_input_tensors{ {"decoder_input", Tensor{MEMORY_GPU, @@ -852,20 +903,26 @@ void Llama::forward(std::unordered_map* output_ten {"masked_tokens", Tensor{MEMORY_GPU, TYPE_BOOL, - {local_batch_size * beam_width, max_cache_seq_len}, - masked_tokens_ + id_offset * max_cache_seq_len}}}; + {local_batch_size * beam_width, max_cache_len}, + masked_tokens_ + id_offset * max_cache_len}}, + {"rotary_position", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &rotary_position}}, + }; std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, data_type, {local_batch_size * beam_width, hidden_units_}, decoder_output_buf_ + hidden_units_offset}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, - {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; + {"key_cache", Tensor{MEMORY_GPU, data_type, full_k_cache_shape, key_cache_full}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, full_v_cache_shape, value_cache_full}}}; gpt_decoder_->forward( &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + rotary_position += 1; + } + // set the decode start id to be 1 for the edge case of BOS == EOS (GPTNeoX tokenizer) + if ((step == max_input_length + 1) && (start_id_ == end_id_)){ + cudaMemsetAsync(output_ids_buf_ + batch_size * max_input_length, 1, 1 , stream_); } - if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { invokeGeneralT5LayerNorm(normed_decoder_output_buf_ + hidden_units_offset, decoder_output_buf_ + hidden_units_offset, @@ -967,6 +1024,7 @@ void Llama::forward(std::unordered_map* output_ten // common outputs bool subbatch_should_stop = false; + std::unordered_map dynamic_decode_output_tensors{ {"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids_buf_}}, @@ -1038,6 +1096,13 @@ void Llama::forward(std::unordered_map* output_ten if (*generation_should_stop_) { break; } + //TODO: hack to stop generation + // need to pass correct parameters to dynamic_decode_layer_ + bool* is_finished = new bool[1]; + cudaD2Hcpy(is_finished, finished_buf_, 1); + if (*is_finished){ + break; + } if (token_generated_cb_ && (step + 1) % token_generated_cb_step_ == 0 && step + 1 < (int)max_output_seq_len) { 
             if (token_generated_cb_ && (step + 1) % token_generated_cb_step_ == 0 && step + 1 < (int)max_output_seq_len) {
                 setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len);
                 sendTensorsToFirstPipelineNode(output_tensors, input_tensors);
@@ -1153,7 +1218,6 @@ void Llama<T>::setOutputTensors(std::unordered_map<std::string, Tensor>* o
         }
     }
     else {
-        // For sampling, it is equivalent to all parent ids are 0.
         gatherTreeParam param;
         param.beams = transposed_output_ids_buf_;
diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/LlamaFiD.h
similarity index 98%
rename from src/fastertransformer/models/llama/Llama.h
rename to src/fastertransformer/models/llama/LlamaFiD.h
index f318e6947..d64ebb674 100644
--- a/src/fastertransformer/models/llama/Llama.h
+++ b/src/fastertransformer/models/llama/LlamaFiD.h
@@ -25,6 +25,7 @@
 #include "src/fastertransformer/models/llama/LlamaWeight.h"
 #include "src/fastertransformer/utils/custom_ar_comm.h"
 #include "src/fastertransformer/utils/prompt_learning.h"
+#include "src/fastertransformer/utils/memory_utils.h"
 
 namespace fastertransformer {
 
@@ -75,7 +76,7 @@ class Llama: public BaseLayer {
     void allocateBuffer() override;
     void allocateBuffer(
-        size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len);
+        size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len, size_t max_cache_len);
     void freeBuffer() override;
 
     void initialize();
@@ -102,7 +103,9 @@ class Llama: public BaseLayer {
     uint32_t* seq_limit_len_ = nullptr;
 
     T* key_cache_;
+    T* key_cache_full;
     T* value_cache_;
+    T* value_cache_full;
     int* cache_indirections_[2] = {nullptr, nullptr};
 
     // prompt_learning weight_batch ptrs
diff --git a/src/fastertransformer/triton_backend/llama/CMakeLists.txt b/src/fastertransformer/triton_backend/llama/CMakeLists.txt
index d5ba5547e..ef2e7a5ec 100644
--- a/src/fastertransformer/triton_backend/llama/CMakeLists.txt
+++ b/src/fastertransformer/triton_backend/llama/CMakeLists.txt
@@ -15,11 +15,11 @@
 cmake_minimum_required(VERSION 3.8)
 
 set(parallel_gpt_triton_backend_files
-    LlamaTritonModel.cc
+    LlamaFiDTritonModel.cc
     LlamaTritonModelInstance.cc
 )
 
 add_library(LlamaTritonBackend STATIC ${parallel_gpt_triton_backend_files})
 set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON)
-target_link_libraries(LlamaTritonBackend PRIVATE TransformerTritonBackend Llama tensor memory_utils -lcublasLt)
+target_link_libraries(LlamaTritonBackend PRIVATE TransformerTritonBackend LlamaFiD tensor memory_utils -lcublasLt)
 target_compile_features(LlamaTritonBackend PRIVATE cxx_std_14)
diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaFiDTritonModel.cc
similarity index 99%
rename from src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc
rename to src/fastertransformer/triton_backend/llama/LlamaFiDTritonModel.cc
index ea34983e4..a1d6a2f35 100644
--- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/fastertransformer/triton_backend/llama/LlamaFiDTritonModel.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
+#include "src/fastertransformer/triton_backend/llama/LlamaFiDTritonModel.h"
 #include "3rdparty/INIReader.h"
 #include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
 #include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h b/src/fastertransformer/triton_backend/llama/LlamaFiDTritonModel.h
similarity index 98%
rename from src/fastertransformer/triton_backend/llama/LlamaTritonModel.h
rename to src/fastertransformer/triton_backend/llama/LlamaFiDTritonModel.h
index da5a277cd..4d0ba82d3 100644
--- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h
+++ b/src/fastertransformer/triton_backend/llama/LlamaFiDTritonModel.h
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "src/fastertransformer/models/llama/Llama.h"
+#include "src/fastertransformer/models/llama/LlamaFiD.h"
 #include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
 #include "src/fastertransformer/utils/cuda_utils.h"
 #include "src/fastertransformer/utils/custom_ar_comm.h"
diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc
index e46adf87d..fc0759ccf 100644
--- a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc
@@ -67,7 +67,7 @@ std::unordered_map<std::string, ft::Tensor> LlamaTritonModelInstance<T>::convert
     h_total_output_lengths_ = reinterpret_cast<uint32_t*>(malloc(request_batch_size * sizeof(uint32_t)));
     for (int i = 0; i < request_batch_size; ++i) {
         h_total_output_lengths_[i] =
-            reinterpret_cast<const uint32_t*>(input_tensors->at("request_output_len").data)[i] + input_data_len;
+            reinterpret_cast<const uint32_t*>(input_tensors->at("request_output_len").data)[i] + request_batch_size * input_data_len;
     }
 
     std::unordered_map<std::string, ft::Tensor> ft_input_tensors = std::unordered_map<std::string, ft::Tensor>{
@@ -170,7 +170,7 @@ LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
         (int*)input_tensors->at("request_output_len").data,
         (int*)input_tensors->at("request_output_len").data + input_tensors->at("request_output_len").shape[0]);
 
-    const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1];
+    const uint32_t total_output_len = max_request_output_len + request_batch_size * input_tensors->at("input_ids").shape[1];
     const uint32_t beam_width = input_tensors->count("beam_width") ?
                                     (size_t)(*(uint*)input_tensors->at("beam_width").data) : 1;
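
The length bookkeeping above changes because, in this FiD-style setup, all request_batch_size passages are folded into a single generation stream, so the input cost scales with the batch size. A toy calculation of the same arithmetic (the values are made up, not taken from the patch):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const uint32_t request_batch_size     = 8;    // context passages sent as the "batch"
        const uint32_t input_data_len         = 128;  // tokens per passage (input_ids.shape[1])
        const uint32_t max_request_output_len = 64;   // new tokens to generate

        // every passage contributes its tokens to the one flattened sequence
        const uint32_t total_output_len = max_request_output_len + request_batch_size * input_data_len;
        std::printf("total_output_len = %u\n", total_output_len);  // 64 + 8 * 128 = 1088
        return 0;
    }
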
@@ -181,12 +181,12 @@ LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
     std::unordered_map<std::string, ft::Tensor> output_tensors = std::unordered_map<std::string, ft::Tensor>{
         {"output_ids",
          ft::Tensor{ft::MEMORY_GPU,
-                    ft::TYPE_UINT32,
+                    ft::TYPE_INT32,
                     std::vector<size_t>{request_batch_size, beam_width, total_output_len},
                     d_output_ids_}},
         {"sequence_length",
          ft::Tensor{ft::MEMORY_GPU,
-                    ft::TYPE_UINT32,
+                    ft::TYPE_INT32,
                     std::vector<size_t>{request_batch_size, beam_width},
                     d_sequence_lengths_}}};
diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h
index a75e0692d..8df0261d5 100644
--- a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h
+++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include "src/fastertransformer/models/llama/Llama.h"
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
+#include "src/fastertransformer/models/llama/LlamaFiD.h"
+#include "src/fastertransformer/triton_backend/llama/LlamaFiDTritonModel.h"
 #include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
 #include
diff --git a/src/fastertransformer/utils/memory_utils.cu b/src/fastertransformer/utils/memory_utils.cu
index 134224a09..5b62ebb50 100644
--- a/src/fastertransformer/utils/memory_utils.cu
+++ b/src/fastertransformer/utils/memory_utils.cu
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include "cuda_fp16.h"
 
 namespace fastertransformer {
 
@@ -748,6 +749,55 @@ __global__ void transpose0213(T_OUT* dst, T_IN* src, const int dim0, const int d
     }
 }
 
+template<typename T_OUT, typename T_IN>
+__global__ void flattenKV(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const int dim2, const int dim3, const int dim4, const int n_context, const int dim3_diff)
+{
+    // src permutation: [dim0, dim1, dim2, dim3, dim4]
+    // dst permutation: [dim0, dim1/n_context, dim2, dim3*n_context + dim3_diff, dim4]
+    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2 * dim3 * dim4;
+         tid += blockDim.x * gridDim.x) {
+
+        int tmp_idx        = tid;
+        const int new_dim1 = dim1 / n_context;
+        const int new_dim3 = dim3 * n_context + dim3_diff;
+
+        const int dim_4_idx = tmp_idx % dim4;
+        tmp_idx             = (tmp_idx - dim_4_idx) / dim4;
+        const int dim_3_idx = tmp_idx % dim3;
+        tmp_idx             = (tmp_idx - dim_3_idx) / dim3;
+        const int dim_2_idx = tmp_idx % dim2;
+        tmp_idx             = (tmp_idx - dim_2_idx) / dim2;
+        const int dim_1_idx = tmp_idx % dim1;
+        tmp_idx             = (tmp_idx - dim_1_idx) / dim1;
+        const int dim_0_idx = tmp_idx % dim0;
+
+        const int new_dim_1_idx = dim_1_idx / n_context;
+        const int new_dim_3_idx = dim3 * (dim_1_idx % n_context) + dim_3_idx;
+        // dst[dim_0_idx * dim2 * dim1 * dim3 * dim4 + dim_2_idx * (dim1 * dim3 * dim4) + dim_1_idx * dim3 * dim4 + dim_3_idx * dim4 + dim_3_idx] = src[tid];
+        dst[dim_0_idx * new_dim1 * dim2 * new_dim3 * dim4 +
+            new_dim_1_idx * dim2 * new_dim3 * dim4 +
+            dim_2_idx * new_dim3 * dim4 +
+            new_dim_3_idx * dim4 +
+            dim_4_idx] = src[tid];
+    }
+}
+
+template<typename T>
+void invokeFlattenKV(T* dst, T* src, const int dim0, const int dim1, const int dim2, const int dim3, const int dim4, const int n_context, const int dim3_diff)
+{
+    flattenKV<<<256, 256>>>(dst, src, dim0, dim1, dim2, dim3, dim4, n_context, dim3_diff);
+}
+#ifdef ENABLE_FP8
+template void invokeFlattenKV(
+    __nv_fp8_e4m3* dst, __nv_fp8_e4m3* src, const int dim0, const int dim1, const int dim2, const int dim3, const int dim4, const int n_context, const int dim3_diff);
+#endif  // ENABLE_FP8
+#ifdef ENABLE_BF16
+template void invokeFlattenKV(
+    __nv_bfloat16* dst, __nv_bfloat16* src, const int dim0, const int dim1, const int dim2, const int dim3, const int dim4, const int n_context, const int dim3_diff);
+#endif  // ENABLE_BF16
+template void invokeFlattenKV(float* dst, float* src, const int dim0, const int dim1, const int dim2, const int dim3, const int dim4, const int n_context, const int dim3_diff);
+template void invokeFlattenKV(__half* dst, __half* src, const int dim0, const int dim1, const int dim2, const int dim3, const int dim4, const int n_context, const int dim3_diff);
+
 template<typename T>
 void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3)
 {
@@ -768,6 +818,7 @@ template void invokeInPlaceTranspose0213(
     float* data, float* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
 
+
 template<typename T_OUT, typename T_IN>
 __global__ void transpose102(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const int dim2)
 {
diff --git a/src/fastertransformer/utils/memory_utils.h b/src/fastertransformer/utils/memory_utils.h
index 9efd53f53..07393c3d3 100644
--- a/src/fastertransformer/utils/memory_utils.h
+++ b/src/fastertransformer/utils/memory_utils.h
@@ -105,6 +105,9 @@ void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim
 template<typename T>
 void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
 
+template<typename T>
+void invokeFlattenKV(T* dst, T* src, const int dim0, const int dim1, const int dim2, const int dim3, const int dim4, const int n_context, const int dim3_diff);
+
 template<typename T>
 void invokeInPlaceTranspose102(T* data, T* workspace, const int dim0, const int dim1, const int dim2);
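
To see the flattening end to end, a small stand-alone check can call the new invokeFlattenKV declared above. This assumes the patch is applied and the program links against memory_utils; all dimensions below are toy values. Two contexts of three cache slots each should come out concatenated into one eight-slot sequence, with the two dim3_diff slots left as zero padding.

    #include "src/fastertransformer/utils/memory_utils.h"
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    int main()
    {
        using namespace fastertransformer;
        const int dim0 = 1, dim1 = 2, dim2 = 1, dim3 = 3, dim4 = 1;
        const int n_context = 2, dim3_diff = 2;  // flat length = 3 * 2 + 2 = 8
        const int src_elems = dim0 * dim1 * dim2 * dim3 * dim4;
        const int dst_elems = dim0 * (dim1 / n_context) * dim2 * (dim3 * n_context + dim3_diff) * dim4;

        std::vector<float> h_src = {0, 1, 2, 10, 11, 12};  // context 0: 0..2, context 1: 10..12
        float *d_src, *d_dst;
        cudaMalloc(&d_src, src_elems * sizeof(float));
        cudaMalloc(&d_dst, dst_elems * sizeof(float));
        cudaMemset(d_dst, 0, dst_elems * sizeof(float));
        cudaMemcpy(d_src, h_src.data(), src_elems * sizeof(float), cudaMemcpyHostToDevice);

        invokeFlattenKV(d_dst, d_src, dim0, dim1, dim2, dim3, dim4, n_context, dim3_diff);
        cudaDeviceSynchronize();

        std::vector<float> h_dst(dst_elems);
        cudaMemcpy(h_dst.data(), d_dst, dst_elems * sizeof(float), cudaMemcpyDeviceToHost);
        for (float v : h_dst) {
            std::printf("%g ", v);  // expected: 0 1 2 10 11 12 0 0
        }
        std::printf("\n");
        cudaFree(d_src);
        cudaFree(d_dst);
        return 0;
    }
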