From 072b50b5d2a3fde93cddd23a2edaed08c74e5808 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E6=99=93=E9=BE=99?= <1225386395@qq.com>
Date: Thu, 25 Apr 2024 10:19:48 +0800
Subject: [PATCH] optimize the way of appending. (#402)

Co-authored-by: wheatxzhang
---
 finetune/run_c3.py                     |  8 ++---
 finetune/run_chid.py                   |  6 ++--
 finetune/run_classifier.py             |  8 ++---
 finetune/run_classifier_multi_label.py |  8 ++---
 finetune/run_classifier_prompt.py      |  8 ++---
 finetune/run_cmrc.py                   |  8 ++---
 finetune/run_dbqa.py                   |  8 ++---
 finetune/run_ner.py                    | 10 +++---
 finetune/run_regression.py             |  8 ++---
 finetune/run_text2text.py              | 14 ++++----
 uer/utils/dataloader.py                | 46 ++++++++++----------------
 11 files changed, 60 insertions(+), 72 deletions(-)

diff --git a/finetune/run_c3.py b/finetune/run_c3.py
index 562e9e8e..3751992a 100644
--- a/finetune/run_c3.py
+++ b/finetune/run_c3.py
@@ -102,10 +102,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset[-1][0].append(src)
             dataset[-1][2].append(seg)
diff --git a/finetune/run_chid.py b/finetune/run_chid.py
index a57086d8..8ef1aadc 100644
--- a/finetune/run_chid.py
+++ b/finetune/run_chid.py
@@ -109,9 +109,9 @@ def read_dataset(args, data_path, answer_path):
             src = args.tokenizer.convert_tokens_to_ids(tokens)[: args.seq_length]
             seg = [0] * len(src)

-            while len(src) < args.seq_length:
-                src.append(0)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                src += [0] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset[-1][0].append(src)
             dataset[-1][2].append(seg)
diff --git a/finetune/run_classifier.py b/finetune/run_classifier.py
index 6b205542..2bc558f4 100644
--- a/finetune/run_classifier.py
+++ b/finetune/run_classifier.py
@@ -160,10 +160,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             if args.soft_targets and "logits" in columns.keys():
                 dataset.append((src, tgt, seg, soft_tgt))
             else:
                 dataset.append((src, tgt, seg))
diff --git a/finetune/run_classifier_multi_label.py b/finetune/run_classifier_multi_label.py
index f87f983f..11c23e34 100644
--- a/finetune/run_classifier_multi_label.py
+++ b/finetune/run_classifier_multi_label.py
@@ -105,10 +105,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append((src, tgt, seg))
diff --git a/finetune/run_classifier_prompt.py b/finetune/run_classifier_prompt.py
index a634e8ff..72fb85b6 100644
--- a/finetune/run_classifier_prompt.py
+++ b/finetune/run_classifier_prompt.py
@@ -104,10 +104,10 @@ def read_dataset(args, path):
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             tgt = [0] * len(src)
             # Ignore the sentence which the answer is not in a sequence
             if mask_position >= args.seq_length:
diff --git a/finetune/run_cmrc.py b/finetune/run_cmrc.py
index 2032a96c..67260dae 100644
--- a/finetune/run_cmrc.py
+++ b/finetune/run_cmrc.py
@@ -116,10 +116,10 @@ def convert_examples_to_dataset(args, examples):
             src_b = args.tokenizer.convert_tokens_to_ids(args.tokenizer.tokenize(span_context) + [SEP_TOKEN])
             src = src_a + src_b
             seg = [1] * len(src_a) + [2] * len(src_b)
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append((src, seg, start_position, end_position, answers, question_id, len(question), doc_span_index, start_offset))
     return dataset
diff --git a/finetune/run_dbqa.py b/finetune/run_dbqa.py
index fb2e45fb..d5147770 100644
--- a/finetune/run_dbqa.py
+++ b/finetune/run_dbqa.py
@@ -41,10 +41,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append((src, tgt, seg, qid))
     return dataset
diff --git a/finetune/run_ner.py b/finetune/run_ner.py
index 75685a68..ec6e04c8 100644
--- a/finetune/run_ner.py
+++ b/finetune/run_ner.py
@@ -110,11 +110,11 @@ def read_dataset(args, path):
                 src = src[: args.seq_length]
                 tgt = tgt[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                tgt.append(args.labels_num - 1)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                tgt += [args.labels_num - 1] * (args.seq_length - len(tgt))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append([src, tgt, seg])
     return dataset
diff --git a/finetune/run_regression.py b/finetune/run_regression.py
index ba979828..ea1172af 100644
--- a/finetune/run_regression.py
+++ b/finetune/run_regression.py
@@ -73,10 +73,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append((src, tgt, seg))
     return dataset
diff --git a/finetune/run_text2text.py b/finetune/run_text2text.py
index d2ebc927..21887050 100644
--- a/finetune/run_text2text.py
+++ b/finetune/run_text2text.py
@@ -95,13 +95,13 @@ def read_dataset(args, path):
                 tgt_seg = tgt_seg[: args.tgt_seq_length]
             tgt_out = tgt_in[1:] + [PAD_ID]

-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
-            while len(tgt_in) < args.tgt_seq_length:
-                tgt_in.append(PAD_ID)
-                tgt_out.append(PAD_ID)
-                tgt_seg.append(0)
+            if len(src) < args.seq_length:
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
+            if len(tgt_in) < args.tgt_seq_length:
+                tgt_in += [PAD_ID] * (args.tgt_seq_length - len(tgt_in))
+                tgt_out += [PAD_ID] * (args.tgt_seq_length - len(tgt_out))
+                tgt_seg += [0] * (args.tgt_seq_length - len(tgt_seg))

             dataset.append((src, tgt_in, tgt_out, seg, tgt_seg))
diff --git a/uer/utils/dataloader.py b/uer/utils/dataloader.py
index c77dc648..3bc02eb6 100644
--- a/uer/utils/dataloader.py
+++ b/uer/utils/dataloader.py
@@ -73,8 +73,7 @@ def __iter__(self):

             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num

                 if len(ins) == 4:
                     src.append(src_single)
@@ -123,8 +122,7 @@ def __iter__(self):

             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num

                 if len(ins) == 3:
                     src.append(src_single)
@@ -175,8 +173,7 @@ def __iter__(self):

             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src.append(src_single[:-1])
                 tgt.append(src_single[1:])
                 seg.append([1] * ins[1][0] + [0] * (len(src_single) - 1 - ins[1][0]))
@@ -206,10 +203,9 @@ def __iter__(self):
             for ins in instances:
                 src_single, pad_num = ins[0]
                 tgt_forward_single, tgt_backward_single = ins[1], ins[2]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
-                    tgt_forward_single.append(self.vocab.get(PAD_TOKEN))
-                    tgt_backward_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                tgt_forward_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                tgt_backward_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src.append(src_single)
                 tgt_forward.append(tgt_forward_single)
                 tgt_backward.append(tgt_backward_single)
@@ -241,11 +237,9 @@ def __iter__(self):

             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 tgt_single, pad_num = ins[1]
-                for _ in range(pad_num):
-                    tgt_single.append(self.vocab.get(PAD_TOKEN))
+                tgt_single += [self.vocab.get(PAD_TOKEN)] * pad_num

                 src.append(src_single)
                 tgt_in.append(tgt_single[:-1])
@@ -283,8 +277,7 @@ def __iter__(self):

             for _, ins in enumerate(instances):
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num

                 if len(ins) == 3:
                     tgt_single = ins[1]
@@ -370,11 +363,9 @@ def __iter__(self):

             for _, ins in enumerate(instances):
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 tgt_single, pad_num = ins[1]
-                for _ in range(pad_num):
-                    tgt_single.append(self.vocab.get(PAD_TOKEN))
+                tgt_single += [self.vocab.get(PAD_TOKEN)] * pad_num

                 src_single, _ = mask_seq(src_single, self.tokenizer, self.whole_word_masking, self.span_masking, self.span_geo_prob, self.span_max_length)
@@ -436,9 +427,8 @@ def __iter__(self):
                 elif len(seg_pos_single) == 2:
                     seg_single = [1] * seg_pos_single[0] + [2] * seg_pos_single[1]

-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
-                    seg_single.append(0)
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                seg_single += [0] * pad_num

                 src.append(src_single)
                 tgt.append(ins[1])
@@ -468,9 +458,8 @@ def __iter__(self):
             for ins in instances:
                 src_single, pad_num = ins[0]
                 tgt_single = ins[1]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
-                    tgt_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                tgt_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src.append(src_single)
                 tgt.append(tgt_single)
                 seg.append([1] * ins[2][0] + [2] * (ins[2][1] - ins[2][0]) + [0] * (len(src_single) - ins[2][1]))
@@ -509,9 +498,8 @@ def __iter__(self):
                 elif len(seg_pos_single) == 2:
                     seg_single = [1] * seg_pos_single[0] + [2] * seg_pos_single[1]

-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
-                    seg_single.append(0)
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                seg_single += [0] * pad_num
                 seg.append(seg_single)

                 if len(ins) == 4 :