NVIDIA · binxuan · Apr 23, 2023 · Apr 24, 2023 · Apr 24, 2023 · Apr 25, 2023
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -101,7 +101,7 @@ if(USE_TRITONSERVER_DATATYPE)
   add_definitions("-DUSE_TRITONSERVER_DATATYPE")
 endif()
 
-set(CXX_STD "14" CACHE STRING "C++ standard")
+set(CXX_STD "17" CACHE STRING "C++ standard")
 
 set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
 
@@ -238,7 +238,7 @@ if(BUILD_TF2)
   add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
 endif()
 
-set(PYTHON_PATH "python" CACHE STRING "Python path")
+set(PYTHON_PATH "python3" CACHE STRING "Python path")
 if(BUILD_PYT)
   execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch; print(torch.__version__,end='');"
                   RESULT_VARIABLE _PYTHON_SUCCESS
@@ -348,6 +348,12 @@ add_library(transformer-shared SHARED
   $<TARGET_OBJECTS:ParallelGptDecoderLayerWeight>
   $<TARGET_OBJECTS:ParallelGptTritonBackend>
   $<TARGET_OBJECTS:ParallelGptWeight>
+  $<TARGET_OBJECTS:LlamaFiD>
+  $<TARGET_OBJECTS:LlamaContextDecoder>
+  $<TARGET_OBJECTS:LlamaDecoder>
+  $<TARGET_OBJECTS:LlamaDecoderLayerWeight>
+  $<TARGET_OBJECTS:LlamaTritonBackend>
+  $<TARGET_OBJECTS:LlamaWeight>
   $<TARGET_OBJECTS:T5Common>
   $<TARGET_OBJECTS:T5Decoder>
   $<TARGET_OBJECTS:T5Decoding>
@@ -428,9 +434,9 @@ target_link_libraries(transformer-shared PUBLIC
   -lnvToolsExt
 )
 endif()
-  
+
 if (ENABLE_FP8)
-target_link_libraries(transformer-shared PUBLIC 
+target_link_libraries(transformer-shared PUBLIC
   $<TARGET_OBJECTS:BertFP8>
   $<TARGET_OBJECTS:BertFP8Weight>
   $<TARGET_OBJECTS:DecoderSelfAttentionFP8Layer>

diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
@@ -28,6 +28,8 @@ add_subdirectory(gptj)
 add_subdirectory(gptneox)
 add_subdirectory(multi_gpu_gpt)
 
+add_subdirectory(llama)
+
 if(ENABLE_FP8)
     add_subdirectory(gpt_fp8)
     add_subdirectory(bert_fp8)

diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt
@@ -0,0 +1,22 @@
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Standalone LLaMA example built from llama_fid_example.cc; links the LlamaFiD
+# model library together with the shared example/utility libraries.
+add_executable(llama_fid_example llama_fid_example.cc)
+target_link_libraries(
+  llama_fid_example
+  PUBLIC
+    -lcublas
+    -lcublasLt
+    -lcudart
+    LlamaFiD
+    nvtx_utils
+    gpt_example_utils
+    word_list
+    mpi_utils
+    nccl_utils
+)
+
+# Triton-backend variant built from llama_fid_triton_example.cc; links the
+# LlamaTritonBackend and TransformerTritonBackend libraries instead of LlamaFiD.
+add_executable(llama_fid_triton_example llama_fid_triton_example.cc)
+target_link_libraries(
+  llama_fid_triton_example
+  PUBLIC
+    -lcublas
+    -lcublasLt
+    -lcudart
+    -lpthread
+    LlamaTritonBackend
+    TransformerTritonBackend
+    custom_ar_comm
+    gpt_example_utils
+    word_list
+    mpi_utils
+    nccl_utils
+    nvtx_utils
+)
diff --git a/examples/cpp/llama/bad_words.csv b/examples/cpp/llama/bad_words.csv
@@ -0,0 +1,2 @@
+7768,3908
+1,2
diff --git a/examples/cpp/llama/huggingface_llama_convert.py b/examples/cpp/llama/huggingface_llama_convert.py
@@ -0,0 +1,194 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import configparser
+import numpy as np
+from pathlib import Path
+
+import os
+from transformers import LlamaForCausalLM
+
+# using numpy extension: https://github.com/GreenWaves-Technologies/bfloat16
+# install the library with `pip install bfloat16`
+from bfloat16 import bfloat16
+
+def get_weight_data_type(data_type):
+    """Map a precision name to the numpy dtype used when writing weights.
+
+    Args:
+        data_type: one of "fp32", "fp16" or "bf16".
+
+    Returns:
+        np.float32, np.float16, or the bfloat16 type imported from the
+        `bfloat16` numpy extension.
+
+    Raises:
+        ValueError: if data_type is not one of the supported names.
+    """
+    if data_type == "fp32":
+        return np.float32
+    if data_type == "fp16":
+        return np.float16
+    if data_type == "bf16":
+        return bfloat16
+    # Raise instead of `assert False`: assert statements are removed under
+    # `python -O`, which would make this function silently return None.
+    raise ValueError(f"Invalid weight data type {data_type}")
+
+
+def split_and_convert_process(saved_dir, factor, key, val):
+    """Write one weight array to disk, sharding it when the key names a split weight.
+
+    Dispatch is by substring match on `key`, first match wins:
+      - layernorm weights are written whole to "<saved_dir>/<key>.bin";
+      - attention-dense and MLP down-projection weights are split along axis 0;
+      - MLP gate/up projections and the fused QKV weight are split along the
+        last axis.
+    Shard j of a split weight is written to "<saved_dir>/<key>.<j>.bin".
+    A key matching none of the patterns is reported on stdout and skipped.
+
+    Args:
+        saved_dir: output directory path (string).
+        factor: number of equal shards to produce for split weights.
+        key: weight name used both for dispatch and as the file stem.
+        val: numpy array holding the weight.
+    """
+    unsplit_patterns = ("input_layernorm.weight", "post_attention_layernorm.weight")
+    # (pattern, split axis) pairs, in the order they are tested.
+    split_axis_by_pattern = (
+        ("attention.dense.weight", 0),
+        ("mlp.down_proj.weight", 0),
+        ("mlp.gate_proj.weight", -1),
+        ("mlp.up_proj.weight", -1),
+        ("attention.query_key_value.weight", -1),
+    )
+
+    if any(pattern in key for pattern in unsplit_patterns):
+        # shared weights, only need to convert the weights of rank 0
+        val.tofile(saved_dir + "/" + key + ".bin")
+        return
+
+    axis = next((ax for pattern, ax in split_axis_by_pattern if pattern in key), None)
+    if axis is None:
+        print("[ERROR] cannot find key '{}'".format(key))
+        return
+
+    for rank, shard in enumerate(np.split(val, factor, axis=axis)):
+        shard.tofile(saved_dir + "/" + key + ".%d.bin" % rank)
+
+def split_and_convert(args):
+    """Convert a HuggingFace LLaMA checkpoint into per-weight .bin files plus a config.ini.
+
+    Output goes to "<args.saved_dir>/<args.infer_gpu_num>-gpu/". Per-layer weights
+    are handed to split_and_convert_process, which shards them `factor` ways
+    (factor = infer_gpu_num // trained_gpu_num); the token embedding, final norm
+    and lm_head weights are written whole.
+
+    Args:
+        args: parsed command-line namespace providing saved_dir, in_file,
+            trained_gpu_num, infer_gpu_num, weight_data_type and model_name.
+
+    Raises:
+        AssertionError: if infer_gpu_num is not a multiple of trained_gpu_num.
+    """
+    saved_dir = args.saved_dir + "/%d-gpu/" % args.infer_gpu_num
+    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
+    os.makedirs(saved_dir, exist_ok=True)
+
+    t_gpu_num = args.trained_gpu_num
+    i_gpu_num = args.infer_gpu_num
+    assert(i_gpu_num % t_gpu_num == 0)
+
+    # Number of output shards per weight; exact because of the assert above.
+    factor = i_gpu_num // t_gpu_num
+
+    model = LlamaForCausalLM.from_pretrained(args.in_file)
+    hf_config = vars(model.config)
+    print(f"hf_config: {hf_config}")
+
+    print("named parameters:")
+    for name, _ in model.named_parameters():
+        print(f"- {name}")
+
+    hidden_size = hf_config["hidden_size"]
+    head_num = hf_config["num_attention_heads"]
+    head_size = hidden_size // head_num
+    num_layers = hf_config["num_hidden_layers"]
+
+    np_weight_data_type = get_weight_data_type(args.weight_data_type)
+
+    # Saving the config is best-effort: a failure is reported but does not
+    # stop the weight conversion below.
+    try:
+        config = configparser.ConfigParser()
+        config['llama'] = {
+            "model_name": args.model_name,
+            "head_num": str(head_num),
+            "size_per_head": str(head_size),
+            "inter_size": str(hf_config["intermediate_size"]),
+            "num_layer": str(num_layers),
+            "rotary_embedding": str(head_size),
+            "layernorm_eps": str(hf_config["rms_norm_eps"]),
+            "vocab_size": str(hf_config["vocab_size"]),
+            "start_id": str(hf_config["bos_token_id"]),
+            "end_id": str(hf_config["eos_token_id"]),
+            "weight_data_type": args.weight_data_type,
+        }
+
+        with open((Path(saved_dir) / "config.ini").as_posix(), 'w') as configfile:
+            config.write(configfile)
+    except Exception as e:
+        print("Fail to save the config in config.ini.")
+        print(e)
+
+    # Fetch the state dict once instead of calling model.state_dict() for every
+    # weight of every layer.
+    state_dict = model.state_dict()
+
+    def layer_weight(layer_idx, hf_name):
+        """Return model.layers.<layer_idx>.<hf_name> as a numpy array of the output dtype."""
+        tensor = state_dict[f'model.layers.{layer_idx}.{hf_name}']
+        return tensor.detach().cpu().numpy().astype(np_weight_data_type)
+
+    # layer-wise weights, example:
+    #   - model.layers.0.self_attn.q_proj.weight
+    #   - model.layers.0.self_attn.k_proj.weight
+    #   - model.layers.0.self_attn.v_proj.weight
+    #   - model.layers.0.self_attn.o_proj.weight
+    #   - model.layers.0.mlp.gate_proj.weight
+    #   - model.layers.0.mlp.down_proj.weight
+    #   - model.layers.0.mlp.up_proj.weight
+    #   - model.layers.0.input_layernorm.weight
+    #   - model.layers.0.post_attention_layernorm.weight
+    for l in range(num_layers):
+        print(f"converting layer {l}")
+        # First merge Q, K and V into a single weight: stacking puts the q/k/v
+        # index on axis 0, and the transpose moves it to the middle axis.
+        # NOTE(review): intended to match FT's [hidden_size, 3, head_num, head_size]
+        # layout (approach copied from huggingface_gptj_ckpt_convert.py) - confirm.
+        qkv_weights = np.stack([
+            layer_weight(l, f'self_attn.{proj}_proj.weight') for proj in ('q', 'k', 'v')
+        ])
+        qkv_weights = np.transpose(qkv_weights, (2, 0, 1))
+        qkv_weights_base_name = f'model.layers.{l}.attention.query_key_value.weight'
+        split_and_convert_process(saved_dir, factor, qkv_weights_base_name, qkv_weights)
+
+        # attention dense: HF o_proj, transposed, saved under the "attention.dense" name
+        o_weight = layer_weight(l, 'self_attn.o_proj.weight').T
+        o_weight_base_name = f'model.layers.{l}.attention.dense.weight'
+        split_and_convert_process(saved_dir, factor, o_weight_base_name, o_weight)
+
+        # MLP: each projection keeps its HF name and is saved transposed
+        for proj in ('down', 'gate', 'up'):
+            mlp_name = f'mlp.{proj}_proj.weight'
+            mlp_weight = layer_weight(l, mlp_name).T
+            split_and_convert_process(saved_dir, factor, f'model.layers.{l}.{mlp_name}', mlp_weight)
+
+        # LayerNorm: saved as-is under the HF name
+        for ln_name in ('input_layernorm.weight', 'post_attention_layernorm.weight'):
+            ln_weight = layer_weight(l, ln_name)
+            split_and_convert_process(saved_dir, factor, f'model.layers.{l}.{ln_name}', ln_weight)
+
+        print(f"done layer {l}")
+
+    # final common weights: HF parameter name -> output file name
+    common_weight_files = {
+        'model.embed_tokens.weight': "model.wte.weight.bin",
+        'model.norm.weight': "model.final_layernorm.weight.bin",
+        'lm_head.weight': "model.lm_head.weight.bin",
+    }
+    for name, param in model.named_parameters():
+        if name in common_weight_files:
+            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + common_weight_files[name])
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the conversion options, echo them, then
+    # run the conversion.
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    # saved_dir is used as a directory: output lands in <saved_dir>/<infer_gpu_num>-gpu/.
+    parser.add_argument('-saved_dir', '-o', type=str, help='output directory for the converted weights', required=True)
+    parser.add_argument('-in_file', '-i', type=str, help='file name of input checkpoint file', required=True)
+    parser.add_argument('-trained_gpu_num', '-t_g', type=int, help='How many gpus for training', default=1)
+    parser.add_argument('-infer_gpu_num', '-i_g', type=int, help='How many gpus for inference', required=True)
+    parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16", "bf16"])
+    parser.add_argument('-model_name', '-m_n', type=str, help='model name', required=True)
+
+    args = parser.parse_args()
+    print("\n=============== Argument ===============")
+    for key in vars(args):
+        print("{}: {}".format(key, vars(args)[key]))
+    print("========================================")
+
+    split_and_convert(args)
diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
@@ -0,0 +1,32 @@
+[ft_instance_hyperparameter]
+data_type=fp16
+enable_custom_all_reduce=0
+
+tensor_para_size=1
+pipeline_para_size=1
+
+model_name=llama_7b
+model_dir=/data/llama-7b-hf-converted/1-gpu
+
+[request]
+beam_width=1 # beam width for beam search
+top_k=1 ; k value for top k sampling
+top_p=0.0 ; p value for top p sampling
+temperature=1.0 ; Use for sampling
+repetition_penalty=1.0 ; Use for sampling
+presence_penalty=0.0  ; Only one of repetition_penalty and presence_penalty is allowed.
+len_penalty=0.0
+beam_search_diversity_rate=0.0
+request_batch_size=8 # determine by the request
+request_output_len=32 # determine by the request
+
+[llama_7b]
+head_num = 32
+size_per_head = 128
+inter_size = 11008
+num_layer = 32
+rotary_embedding = 128
+vocab_size = 32000
+start_id = 0
+end_id = 1
+weight_data_type = fp16