@@ -0,0 +1,30 @@
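"""Convert the raw problem/solution JSONL dataset into the `conversations`
format ("human"/"gpt" turns) used for fine-tuning."""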
import json

import jsonlines

# Read the raw JSONL dataset; each record is expected to carry
# 'problem' and 'solution' fields.
with jsonlines.open(r"E:\DianWork\DianGPT\昇腾AI\dataset\trainset_v2.json", mode="r") as reader:
    data = [i for i in reader]

# Alpaca-style prompt template applied to every question.
TEMPLATE_Q = ("Below is an instruction that describes a task. "
              "Write a response that appropriately completes the request. "
              "### Instruction: {query} ### Response:")

formatted_data = []

for n, item in enumerate(data):
    formatted_sample = {"id": f"{n}", "conversations": []}
    q = TEMPLATE_Q.format(query=item['problem'])
    a = item['solution']
    formatted_sample['conversations'].append(
        {
            "from": "human",
            "value": q,
        }
    )
    formatted_sample['conversations'].append(
        {
            "from": "gpt",
            "value": a,
        }
    )
    formatted_data.append(formatted_sample)

# Use a context manager so the output file is flushed and closed.
with open(r"E:\DianWork\DianGPT\昇腾AI\dataset\trainset_conversation_v2.json",
          "w", encoding="utf-8") as json_file:
    json.dump(formatted_data, json_file, ensure_ascii=False, indent=4)
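# For reference, one record is transformed roughly as follows
# (the values here are hypothetical):
#   in:  {"problem": "What is 2 + 2?", "solution": "4"}
#   out: {"id": "0",
#         "conversations": [
#             {"from": "human", "value": "Below is an instruction ... ### Instruction: What is 2 + 2? ### Response:"},
#             {"from": "gpt", "value": "4"}]}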
@@ -0,0 +1,260 @@
# Copyright 2024 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Llama3 Train/Finetune/Predict scripts."""
import os
import sys
import shutil
import argparse
import json
import numpy as np

# pylint: disable=W0611
from mindformers import Trainer, MindFormerConfig
from mindformers import init_context, ContextConfig, ParallelContextConfig
from mindformers.tools.utils import check_in_modelarts, str2bool
from mindformers.tools.logger import logger
from mindformers.tools.cloud_adapter import cloud_monitor
from mindformers.core.context import build_context
from mindformers.tools import get_output_root_path

from llama3_tokenizer import Llama3Tokenizer

import mindspore as ms

if check_in_modelarts():
    import moxing as mox

sys.path.insert(0, os.getcwd().split('research')[0])


def clear_auto_trans_output(config):
    """clear transformed_checkpoint and strategy"""
    if check_in_modelarts():
        obs_strategy_dir = os.path.join(config.remote_save_url, "strategy")
        if mox.file.exists(obs_strategy_dir) and config.local_rank == 0:
            mox.file.remove(obs_strategy_dir, recursive=True)
        mox.file.make_dirs(obs_strategy_dir)
        obs_transformed_ckpt_dir = os.path.join(config.remote_save_url, "transformed_checkpoint")
        if mox.file.exists(obs_transformed_ckpt_dir) and config.local_rank == 0:
            mox.file.remove(obs_transformed_ckpt_dir, recursive=True)
        mox.file.make_dirs(obs_transformed_ckpt_dir)
    else:
        strategy_dir = os.path.join(get_output_root_path(), "strategy")
        if os.path.exists(strategy_dir) and config.local_rank == 0:
            shutil.rmtree(strategy_dir)
        os.makedirs(strategy_dir, exist_ok=True)
        transformed_ckpt_dir = os.path.join(get_output_root_path(), "transformed_checkpoint")
        if os.path.exists(transformed_ckpt_dir) and config.local_rank == 0:
            shutil.rmtree(transformed_ckpt_dir)
        os.makedirs(transformed_ckpt_dir, exist_ok=True)


def context_init(use_parallel=False, optimizer_parallel=False, device_id=0):
    """init context for mindspore."""
    context_config = ContextConfig(mode=0, device_target="Ascend", device_id=device_id)
    parallel_config = None
    if use_parallel:
        parallel_config = ParallelContextConfig(parallel_mode='SEMI_AUTO_PARALLEL',
                                                gradients_mean=False,
                                                enable_parallel_optimizer=optimizer_parallel,
                                                full_batch=True)
    init_context(use_parallel=use_parallel,
                 context_config=context_config,
                 parallel_config=parallel_config)
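# Note: main() below initializes the context via build_context(config);
# context_init is kept as a standalone helper for manual setup.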


@cloud_monitor()
def main(task='text_generation',
         config='llama3/run_llama3_8b.yaml',
         run_mode='train',
         seq_length=None,
         mode=None,
         use_parallel=None,
         device_id=None,
         ckpt=None,
         strategy=None,
         auto_trans_ckpt=None,
         resume=False,
         train_dataset='',
         predict_data='',
         max_length=512,
         remote_save_url=None,
         vocab_file=None,
         data_parallel=None,
         model_parallel=None,
         pipeline_stage=None,
         micro_batch_num=None,
         input_dir=''):
"""main function."""

assert os.path.exists(config) and config.endswith(('.yaml', '.yml'))

# init config
config = MindFormerConfig(os.path.realpath(config))
if seq_length is not None:
config.model.model_config.seq_length = seq_length
if mode is not None:
config.context.mode = mode
if mode:
config.recompute_config.recompute = False
if use_parallel is not None:
config.use_parallel = use_parallel
if device_id is not None:
config.context.device_id = device_id
if ckpt is None:
ckpt = config.load_checkpoint
if strategy is not None and os.path.exists(strategy):
config.src_strategy_path_or_dir = strategy
if auto_trans_ckpt is not None:
config.auto_trans_ckpt = auto_trans_ckpt
if remote_save_url is not None:
config.remote_save_url = remote_save_url
if vocab_file is not None:
config.processor.tokenizer.vocab_file = vocab_file
if data_parallel is not None:
config.parallel_config.data_parallel = data_parallel
if model_parallel is not None:
config.parallel_config.model_parallel = model_parallel
if pipeline_stage is not None:
config.parallel_config.pipeline_stage = pipeline_stage
if micro_batch_num is not None:
config.parallel_config.micro_batch_num = micro_batch_num

if config.output_dir != './output':
raise ValueError("output_dir must be set to './output' and cannot be customized.")

# init context
build_context(config)

if run_mode in ['train', 'finetune']:
config.model.model_config.use_past = False


    # Load the JSON-lines inference set; each record is expected to carry a
    # 'problem' field. NOTE: this prompt template should match the one used
    # to build the training data.
    if run_mode == 'predict' and input_dir:
        predict_data = []
        with open(input_dir, 'r', encoding='utf-8') as file:
            for line in file:
                record = json.loads(line)
                problem = record['problem']
                conversation = (
                    "Below is an instruction that describes a task. "
                    "Write a response that appropriately completes the request."
                    f"\n\n### Instruction:\n{problem}\n\n### Response: "
                )
                predict_data.append(conversation)
        logger.info("infer list len: %d", len(predict_data))


    # start task
    if run_mode == 'train':
        trainer = Trainer(args=config,
                          task=task,
                          train_dataset=train_dataset)
        trainer.train(train_checkpoint=ckpt, auto_trans_ckpt=config.auto_trans_ckpt, resume_training=resume)
    elif run_mode == 'finetune':
        trainer = Trainer(args=config,
                          task=task,
                          train_dataset=train_dataset)
        trainer.finetune(finetune_checkpoint=ckpt, auto_trans_ckpt=config.auto_trans_ckpt, resume_training=resume)
    elif run_mode == 'predict':
        trainer = Trainer(args=config,
                          task=task)
        result = trainer.predict(input_data=predict_data,
                                 predict_checkpoint=ckpt,
                                 auto_trans_ckpt=config.auto_trans_ckpt,
                                 max_length=int(max_length),
                                 batch_size=4)
        logger.info(result)

        # Persist raw predictions for later post-processing.
        fpath = "result_npy.npy"
        with open(fpath, 'wb') as f:
            np.save(f, result)
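        # A minimal sketch of reading the predictions back (assuming
        # trainer.predict returned a pickled Python object rather than a
        # plain numeric array):
        #   result = np.load("result_npy.npy", allow_pickle=True)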


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--task', default='text_generation', type=str,
help='set task type.')
parser.add_argument('--config', default='llama3/run_llama3_8b.yaml', type=str,
help='set task type.')
parser.add_argument('--run_mode', default='train', type=str,
help='set run mode for model.')
parser.add_argument('--seq_length', default=None, type=int,
help='seq_length')
parser.add_argument('--use_parallel', default=None, type=str2bool,
help='open parallel for model.')
parser.add_argument('--device_id', default=None, type=int,
help='device id set when run on single card. Default: 0')
parser.add_argument('--mode', default=0, type=int,
help='0--Graph Mode; 1--Pynative Mode')
parser.add_argument('--load_checkpoint', default=None, type=str,
help='checkpoint name or dir to load.')
parser.add_argument('--src_strategy', default=None, type=str,
help='strategy of load_checkpoint')
parser.add_argument('--auto_trans_ckpt', default=None, type=str2bool,
help='whether to transform checkpoint to the checkpoint matching current distribute strategy.')
parser.add_argument('--resume', default=None, type=str2bool,
help='whether resume training.')
parser.add_argument('--train_dataset', default='', type=str,
help='set train dataset.')
parser.add_argument('--eval_dataset', default='', type=str,
help='set eval dataset.')
parser.add_argument('--predict_data', default='', type=str, nargs='+',
help='input predict data.')
parser.add_argument('--max_length', default=512, type=int,
help='max length for predict output.')
parser.add_argument('--remote_save_url', default='', type=str,
help='whether use optimizer parallel. Default: None')
parser.add_argument('--vocab_file', default=None, type=str,
help='tokenizer model')
parser.add_argument('--dp', default=None, type=int,
help='data parallel')
parser.add_argument('--mp', default=None, type=int,
help='model parallel')
parser.add_argument('--pp', default=None, type=int,
help='pipeline stage')
parser.add_argument('--micro_batch_num', default=None, type=int,
help='micro batch num')

parser.add_argument('--input_dir', default=None, type=str,
help='input json dir ')

args = parser.parse_args()

    main(task=args.task,
         config=args.config,
         run_mode=args.run_mode,
         seq_length=args.seq_length,
         mode=args.mode,
         use_parallel=args.use_parallel,
         device_id=args.device_id,
         ckpt=args.load_checkpoint,
         strategy=args.src_strategy,
         auto_trans_ckpt=args.auto_trans_ckpt,
         resume=args.resume,
         train_dataset=args.train_dataset,
         predict_data=args.predict_data,
         max_length=args.max_length,
         remote_save_url=args.remote_save_url,
         vocab_file=args.vocab_file,
         data_parallel=args.dp,
         model_parallel=args.mp,
         pipeline_stage=args.pp,
         micro_batch_num=args.micro_batch_num,
         input_dir=args.input_dir)
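    # Example invocations (file name and paths are placeholders, assuming
    # this script is saved as run_llama3.py):
    #   python run_llama3.py --config llama3/run_llama3_8b.yaml --run_mode finetune \
    #       --train_dataset /path/to/trainset_conversation_v2.json --use_parallel True
    #   python run_llama3.py --config llama3/run_llama3_8b.yaml --run_mode predict \
    #       --load_checkpoint /path/to/ckpt --vocab_file /path/to/tokenizer.model \
    #       --input_dir /path/to/testset.json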