Showing 4 changed files with 347 additions and 0 deletions.
@@ -0,0 +1,23 @@
name: Sync code to Qiniu

on:
  push:
    branches: [ main ]

jobs:
  sync_data:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Run the sync script
        # NOTE: steps.parse_body is not defined in this job, so this condition is never true as written.
        if: ${{ steps.parse_body.outputs.sync_type == 'huggingface' }}
        run: |
          pip install -r requirements.txt
          for d in tools; do
            bash ./sync_hf.sh "$d" "playground/$d"
          done
        env:
          QUNIU_ACCESS_TOKEN: ${{ secrets.QUNIU_ACCESS_TOKEN }}
          QUNIU_SECRET_KEY: ${{ secrets.QUNIU_SECRET_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
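
The sync_hf.sh script invoked above is not part of this diff, so the following is only a rough sketch of what one leg of such a sync could look like in Python, using the official qiniu SDK and the same environment variables the workflow exposes. The bucket and prefix names are made up for illustration; the real script may work quite differently.

import os

from qiniu import Auth, put_file  # official Qiniu Python SDK


def upload_dir(local_dir, bucket, prefix=""):
    """Upload every file under local_dir to the given Qiniu bucket (illustrative sketch)."""
    q = Auth(os.environ["QUNIU_ACCESS_TOKEN"], os.environ["QUNIU_SECRET_KEY"])
    for root, _dirs, files in os.walk(local_dir):
        for fname in files:
            local_path = os.path.join(root, fname)
            key = os.path.join(prefix, os.path.relpath(local_path, local_dir))
            token = q.upload_token(bucket, key, 3600)  # upload token valid for one hour
            _ret, info = put_file(token, key, local_path)
            print(key, info.status_code)


if __name__ == "__main__":
    # "example-bucket" and the prefix are placeholders, mirroring the workflow's "playground/$d" layout.
    upload_dir("tools", bucket="example-bucket", prefix="playground/tools")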
File renamed without changes.
@@ -0,0 +1,10 @@
# This configuration file was automatically generated by Gitpod.
# Please adjust to your needs (see https://www.gitpod.io/docs/introduction/learn-gitpod/gitpod-yaml)
# and commit this file to your remote git repository to share the goodness with others.

# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart

tasks:
  - init: pip install -r requirements.txt
@@ -0,0 +1,314 @@
import argparse
import os
import re
import zipfile

import torch

from transformers import AutoTokenizer, GPT2Config


def recursive_print(name, val, spaces=0):
    # Format the message.
    if name is None:
        msg = None
    else:
        fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}"
        msg = fmt.format(name)

    # Print and recurse (if needed).
    if isinstance(val, dict):
        if msg is not None:
            print(msg)
        for k in val.keys():
            recursive_print(k, val[k], spaces + 2)
    elif isinstance(val, torch.Tensor):
        print(msg, ":", val.size())
    else:
        print(msg, ":", val)


def fix_query_key_value_ordering(param, checkpoint_version, num_splits, num_heads, hidden_size):
    # Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :]
    # for compatibility with later versions of NVIDIA Megatron-LM.
    # The inverse operation is performed inside Megatron-LM to read checkpoints:
    # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209
    # If param is the weight tensor of the self-attention block, the returned tensor
    # will have to be transposed one more time to be read by HuggingFace GPT2.
    input_shape = param.size()
    if checkpoint_version == 1.0:
        # version 1.0 stores [num_heads * hidden_size * num_splits, :]
        saved_shape = (num_heads, hidden_size, num_splits) + input_shape[1:]
        param = param.view(*saved_shape)
        param = param.transpose(0, 2)
        param = param.transpose(1, 2).contiguous()
    elif checkpoint_version >= 2.0:
        # other versions store [num_heads * num_splits * hidden_size, :]
        saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:]
        param = param.view(*saved_shape)
        param = param.transpose(0, 1).contiguous()
    param = param.view(*input_shape)
    return param
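
# Worked example for the reordering above (illustrative values, not from any real checkpoint):
# with checkpoint_version >= 2.0, num_splits=3 (Q, K, V) and num_heads=2, the packed QKV rows
# arrive grouped per head as [h0-q, h0-k, h0-v, h1-q, h1-k, h1-v]; viewing them as
# (num_heads, num_splits, hidden_size, ...) and transposing dims 0 and 1 regroups them per split
# as [h0-q, h1-q, h0-k, h1-k, h0-v, h1-v], i.e. the [num_splits * num_heads * hidden_size, :]
# layout that later Megatron-LM versions (and the GPT2 conversion below) expect.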

####################################################################################################


def convert_megatron_checkpoint(args, input_state_dict, config):
    # The converted output model.
    output_state_dict = {}

    # old versions did not store training args
    ds_args = input_state_dict.get("args", None)
    if ds_args is not None:
        # do not make the user write a config file when the exact dimensions/sizes are already in the checkpoint
        # from pprint import pprint
        # pprint(vars(ds_args))

        config.vocab_size = ds_args.padded_vocab_size
        config.n_positions = ds_args.max_position_embeddings
        config.n_embd = ds_args.hidden_size
        config.n_layer = ds_args.num_layers
        config.n_head = ds_args.num_attention_heads
        config.n_inner = ds_args.ffn_hidden_size
        # pprint(config)

    # The number of heads.
    heads = config.n_head
    # The hidden_size per head.
    hidden_size_per_head = config.n_embd // config.n_head
    # Megatron-LM checkpoint version
    if "checkpoint_version" in input_state_dict.keys():
        checkpoint_version = input_state_dict["checkpoint_version"]
    else:
        checkpoint_version = 0.0

    # The model.
    model = input_state_dict["model"]
    # The language model.
    lm = model["language_model"]
    # The embeddings.
    embeddings = lm["embedding"]

    # The word embeddings.
    word_embeddings = embeddings["word_embeddings"]["weight"]
    # Truncate the embedding table to vocab_size rows.
    word_embeddings = word_embeddings[: config.vocab_size, :]
    output_state_dict["transformer.wte.weight"] = word_embeddings

    # The position embeddings.
    pos_embeddings = embeddings["position_embeddings"]["weight"]
    # Read the causal mask dimension (seqlen). [max_sequence_length, hidden_size]
    n_positions = pos_embeddings.size(0)
    if n_positions != config.n_positions:
        raise ValueError(
            f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match"
        )
    # Store the position embeddings.
    output_state_dict["transformer.wpe.weight"] = pos_embeddings

    # The transformer.
    transformer = lm["transformer"] if "transformer" in lm.keys() else lm["encoder"]

    # The regex to extract layer names.
    layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")

    # The simple map of names for "automated" rules.
    megatron_to_transformers = {
        "attention.dense": ".attn.c_proj.",
        "self_attention.dense": ".attn.c_proj.",
        "mlp.dense_h_to_4h": ".mlp.c_fc.",
        "mlp.dense_4h_to_h": ".mlp.c_proj.",
    }
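
    # Example of how one checkpoint key is rewritten by the regex and map above (illustrative):
    #   "layers.0.self_attention.dense.weight"
    #     -> layer_re groups ("0", "self_attention.dense", "weight")
    #     -> "transformer.h.0" + ".attn.c_proj." + "weight"
    #     -> "transformer.h.0.attn.c_proj.weight"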

    # Extract the layers.
    for key, val in transformer.items():
        # Match the name.
        m = layer_re.match(key)

        # Stop if that's not a layer
        if m is None:
            break

        # The index of the layer.
        layer_idx = int(m.group(1))
        # The name of the operation.
        op_name = m.group(2)
        # Is it a weight or a bias?
        weight_or_bias = m.group(3)

        # The name of the layer.
        layer_name = f"transformer.h.{layer_idx}"

        # For layernorm(s), simply store the layer norm.
        if op_name.endswith("_norm"):
            ln_name = "ln_1" if op_name.startswith("input") else "ln_2"
            output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val

        # Transpose the QKV matrix.
        elif (
            op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value"
        ) and weight_or_bias == "weight":
            # Insert a tensor of 1x1xDxD bias.
            causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view(
                1, 1, n_positions, n_positions
            )
            output_state_dict[layer_name + ".attn.bias"] = causal_mask

            # Insert a "dummy" tensor for masked_bias.
            masked_bias = torch.tensor(-1e4, dtype=torch.float16)
            output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias

            out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head)
            # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D.
            out_val = out_val.transpose(0, 1).contiguous()
            # Store.
            output_state_dict[layer_name + ".attn.c_attn.weight"] = out_val

        # Transpose the bias.
        elif (
            op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value"
        ) and weight_or_bias == "bias":
            out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head)
            # Store. No change of shape.
            output_state_dict[layer_name + ".attn.c_attn.bias"] = out_val

        # Transpose the weights.
        elif weight_or_bias == "weight":
            out_name = megatron_to_transformers[op_name]
            output_state_dict[layer_name + out_name + "weight"] = val.transpose(0, 1)

        # Copy the bias.
        elif weight_or_bias == "bias":
            out_name = megatron_to_transformers[op_name]
            output_state_dict[layer_name + out_name + "bias"] = val

    # DEBUG.
    assert config.n_layer == layer_idx + 1

    # The final layernorm.
    output_state_dict["transformer.ln_f.weight"] = transformer["final_norm.weight"]
    output_state_dict["transformer.ln_f.bias"] = transformer["final_norm.bias"]

    # For the LM head, transformers wants the weight matrix tied to the word embeddings.
    output_state_dict["lm_head.weight"] = word_embeddings

    # It should be done!
    return output_state_dict

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--print-checkpoint-structure", action="store_true")
    parser.add_argument(
        "path_to_checkpoint",
        type=str,
        help="Path to the checkpoint file (.zip archive or direct .pt file)",
    )
    parser.add_argument(
        "output",
        type=str,
        help="Path to the output directory where the converted PyTorch model is saved",
    )
    parser.add_argument(
        "--config_file",
        default="",
        type=str,
        help="An optional config json file describing the pre-trained model.",
    )
    args = parser.parse_args()
    os.makedirs(args.output, exist_ok=True)

    # The output directory; everything below is saved here.
    basename = args.output

    # Load the model.
    # the .zip is very optional, let's keep it for backward compatibility
    print(f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}")
    if args.path_to_checkpoint.endswith(".zip"):
        with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint:
            with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict:
                input_state_dict = torch.load(pytorch_dict, map_location="cpu")
    else:
        input_state_dict = torch.load(args.path_to_checkpoint, map_location="cpu")

    ds_args = input_state_dict.get("args", None)

    # Read the config, or default to the model released by NVIDIA.
    if args.config_file == "":
        if ds_args is not None:
            if ds_args.bias_gelu_fusion:
                activation_function = "gelu_fast"
            elif ds_args.openai_gelu:
                activation_function = "gelu_new"
            else:
                activation_function = "gelu"
        else:
            # in the very early days this used to be "gelu_new"
            activation_function = "gelu_new"

        # Spell out all parameters in case the defaults change.
        config = GPT2Config(
            vocab_size=50257,
            n_positions=1024,
            n_embd=1024,
            n_layer=24,
            n_head=16,
            n_inner=4096,
            activation_function=activation_function,
            resid_pdrop=0.1,
            embd_pdrop=0.1,
            attn_pdrop=0.1,
            layer_norm_epsilon=1e-5,
            initializer_range=0.02,
            summary_type="cls_index",
            summary_use_proj=True,
            summary_activation=None,
            summary_proj_to_labels=True,
            summary_first_dropout=0.1,
            scale_attn_weights=True,
            use_cache=True,
            bos_token_id=50256,
            eos_token_id=50256,
        )
    else:
        config = GPT2Config.from_json_file(args.config_file)

    config.architectures = ["GPT2LMHeadModel"]

    # Convert.
    print("Converting...")

    output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config)

    # Print the structure of converted state dict.
    if args.print_checkpoint_structure:
        recursive_print(None, output_state_dict)

    # Add tokenizer class info to config
    # see https://github.com/huggingface/transformers/issues/13906
    if ds_args is not None:
        tokenizer_type = ds_args.tokenizer_type
        if tokenizer_type == "GPT2BPETokenizer":
            tokenizer_model_name = "openai-community/gpt2"
        elif tokenizer_type == "PretrainedFromHF":
            tokenizer_model_name = ds_args.tokenizer_name_or_path
        else:
            raise ValueError(f"Unrecognized tokenizer_type {tokenizer_type}")
    else:
        tokenizer_model_name = "openai-community/gpt2"
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name)
    tokenizer_class = type(tokenizer).__name__
    config.tokenizer_class = tokenizer_class

    # Store the config to file.
    print("Saving config")
    config.save_pretrained(basename)

    # Save tokenizer based on args
    print(f"Adding {tokenizer_class} tokenizer files")
    tokenizer.save_pretrained(basename)

    # Store the state_dict to file.
    output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
    print(f'Saving checkpoint to "{output_checkpoint_file}"')
    torch.save(output_state_dict, output_checkpoint_file)
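
A typical invocation of the conversion script (the file name and checkpoint path below are assumed for illustration, since the diff view does not show them) would look like:

    python convert_megatron_gpt2_checkpoint.py --print-checkpoint-structure /path/to/megatron/checkpoint.zip ./converted-gpt2

This writes config.json, the tokenizer files, and pytorch_model.bin into ./converted-gpt2, which can then be loaded with GPT2LMHeadModel.from_pretrained("./converted-gpt2").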