improve benchmarks on NLP
WwZzz committed Jul 13, 2023
1 parent 09a35a8 commit 756aecb
Showing 24 changed files with 382 additions and 396 deletions.
21 changes: 20 additions & 1 deletion flgo/benchmark/agnews_classification/config.py
@@ -10,13 +10,26 @@
test_data = torchtext.datasets.AG_NEWS(root=path, split='test')
ngrams = 2
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter, ngrams):
for _, text in data_iter:
yield ngrams_iterator(tokenizer(text), ngrams)

vocab = build_vocab_from_iterator(yield_tokens(train_data, ngrams), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

def text_pipeline(x):
return vocab(list(ngrams_iterator(tokenizer(x), ngrams)))

def label_pipeline(x):
return int(x) - 1

def apply_transform(x):
return text_pipeline(x[1]), label_pipeline(x[0])

train_data = train_data.map(apply_transform)
test_data = test_data.map(apply_transform)
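For reference, a minimal end-to-end sketch of what the newly added preprocessing does to one raw (label, text) pair. The toy corpus below is illustrative only; the real vocab is built from the full AG_NEWS training split exactly as above.

from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
ngrams = 2
toy_corpus = [(3, "Wall St. Bears Claw Back Into the Black")]  # stands in for train_data

def yield_tokens(data_iter, ngrams):
    for _, text in data_iter:
        yield ngrams_iterator(tokenizer(text), ngrams)

vocab = build_vocab_from_iterator(yield_tokens(toy_corpus, ngrams), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

label, text = toy_corpus[0]
token_ids = vocab(list(ngrams_iterator(tokenizer(text), ngrams)))  # unigram and bigram ids
print(token_ids, int(label) - 1)  # list[int], 2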

class TextClassificationModel(torch.nn.Module):
def __init__(self, vocab_size, embed_dim, num_class):
super(TextClassificationModel, self).__init__()
@@ -30,7 +43,13 @@ def init_weights(self):
self.fc.weight.data.uniform_(-initrange, initrange)
self.fc.bias.data.zero_()

def forward(self, text, offsets):
def forward(self, text):
offsets = [0]
for t in text:
offsets.append(t.size(0))
offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
text = torch.cat(text)
offsets = offsets.to(text.device)
embedded = self.embedding(text, offsets)
return self.fc(embedded)
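A minimal sketch of how the revised forward is called: it now takes a list of variable-length 1-D token-id tensors and derives the offsets itself, assuming self.embedding is an nn.EmbeddingBag as in the original torchtext text-classification model (the sizes below are made up).

import torch

model = TextClassificationModel(vocab_size=100, embed_dim=16, num_class=4)
batch = [torch.randint(0, 100, (n,)) for n in (5, 3, 7)]  # three samples of different lengths
logits = model(batch)   # offsets [0, 5, 8] are computed inside forward
print(logits.shape)     # torch.Size([3, 4])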

41 changes: 7 additions & 34 deletions flgo/benchmark/agnews_classification/core.py
@@ -3,31 +3,13 @@
import torch.utils.data
from flgo.benchmark.toolkits.nlp.classification import GeneralCalculator
from flgo.benchmark.base import FromDatasetPipe, FromDatasetGenerator
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.data.utils import ngrams_iterator
from torchtext.data.functional import to_map_style_dataset
try:
import ujson as json
except:
import json
from .config import train_data
try:
from .config import tokenizer
except:
tokenizer = None
try:
from .config import ngrams
except:
ngrams = 1

def yield_tokens(data_iter, ngrams):
for _, text in data_iter:
yield ngrams_iterator(tokenizer(text), ngrams)

try:
from .config import vocab
except:
vocab = None
try:
from .config import test_data
except:
@@ -37,30 +19,21 @@ def yield_tokens(data_iter, ngrams):
except:
val_data = None

if tokenizer is None: tokenizer = get_tokenizer('basic_english')
if vocab is None:
vocab = build_vocab_from_iterator(yield_tokens(train_data, ngrams), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

def collate_batch(batch):
label_list, text_list, offsets = [], [], [0]
for (_label, _text) in batch:
label_list.append(int(_label)-1)
processed_text = torch.tensor(vocab(list(ngrams_iterator(tokenizer(_text), ngrams))), dtype=torch.int64)
label_list, text_list = [], []
for (_text, _label) in batch:
label_list.append(_label)
processed_text = torch.tensor(_text, dtype=torch.int64)
text_list.append(processed_text)
offsets.append(processed_text.size(0))
label_list = torch.tensor(label_list, dtype=torch.int64)
offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
text_list = torch.cat(text_list)
return label_list, text_list, offsets
return text_list, label_list
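The slimmed-down collate_batch no longer tokenises or builds offsets; it just tensorises samples that config.py has already numericalised and returns them in the (text, label) order the new model expects. A small illustrative call (the same function can be passed as collate_fn to a DataLoader):

batch = [([1, 5, 9, 2], 0), ([4, 4, 7], 3)]   # (token_ids, 0-based label) pairs
texts, labels = collate_batch(batch)
# texts  -> list of two 1-D int64 tensors, fed directly to model(texts)
# labels -> tensor([0, 3])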

class TaskGenerator(FromDatasetGenerator):
def __init__(self):
super(TaskGenerator, self).__init__(benchmark=os.path.split(os.path.dirname(__file__))[-1],
train_data=train_data, val_data=val_data, test_data=test_data)

def prepare_data_for_partition(self):
self.train_data = self.train_data.map(lambda x: (x[1], x[0]))
return to_map_style_dataset(self.train_data)

class TaskPipe(FromDatasetPipe):
@@ -70,7 +43,7 @@ def __init__(self, task_path):

def save_task(self, generator):
client_names = self.gen_client_names(len(generator.local_datas))
feddata = {'client_names': client_names,}
feddata = {'client_names': client_names}
for cid in range(len(client_names)): feddata[client_names[cid]] = {'data': generator.local_datas[cid],}
with open(os.path.join(self.task_path, 'data.json'), 'w') as outf:
json.dump(feddata, outf)
Empty file.
59 changes: 0 additions & 59 deletions flgo/benchmark/agnews_classification/model/bag_linear.py

This file was deleted.

20 changes: 19 additions & 1 deletion flgo/benchmark/imdb_classification/config.py
@@ -17,6 +17,18 @@ def yield_tokens(data_iter, ngrams):
vocab = build_vocab_from_iterator(yield_tokens(train_data, ngrams), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

def text_pipeline(x):
return vocab(list(ngrams_iterator(tokenizer(x), ngrams)))

def label_pipeline(x):
return int(x) - 1

def apply_transform(x):
return text_pipeline(x[1]), label_pipeline(x[0])

train_data = train_data.map(apply_transform)
test_data = test_data.map(apply_transform)
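As a reminder of what the n-gram expansion inside text_pipeline yields (ngrams=2 here, matching this config), a tiny self-contained example:

from torchtext.data.utils import get_tokenizer, ngrams_iterator

tokenizer = get_tokenizer('basic_english')
print(list(ngrams_iterator(tokenizer("a great movie"), 2)))
# ['a', 'great', 'movie', 'a great', 'great movie']  (unigrams first, then bigrams)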

class TextClassificationModel(torch.nn.Module):
def __init__(self, vocab_size, embed_dim, num_class):
super(TextClassificationModel, self).__init__()
@@ -30,7 +42,13 @@ def init_weights(self):
self.fc.weight.data.uniform_(-initrange, initrange)
self.fc.bias.data.zero_()

def forward(self, text, offsets):
def forward(self, text):
offsets = [0]
for t in text:
offsets.append(t.size(0))
offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
text = torch.cat(text)
offsets = offsets.to(text.device)
embedded = self.embedding(text, offsets)
return self.fc(embedded)

41 changes: 7 additions & 34 deletions flgo/benchmark/imdb_classification/core.py
@@ -3,31 +3,13 @@
import torch.utils.data
from flgo.benchmark.toolkits.nlp.classification import GeneralCalculator
from flgo.benchmark.base import FromDatasetPipe, FromDatasetGenerator
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.data.functional import to_map_style_dataset

try:
import ujson as json
except:
import json
from .config import train_data
try:
from .config import tokenizer
except:
tokenizer = None
try:
from .config import ngrams
except:
ngrams = 1

def yield_tokens(data_iter, ngrams):
for _, text in data_iter:
yield ngrams_iterator(tokenizer(text), ngrams)

try:
from .config import vocab
except:
vocab = None
try:
from .config import test_data
except:
@@ -37,30 +19,21 @@ def yield_tokens(data_iter, ngrams):
except:
val_data = None

if tokenizer is None: tokenizer = get_tokenizer('basic_english')
if vocab is None:
vocab = build_vocab_from_iterator(yield_tokens(train_data, ngrams), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

def collate_batch(batch):
label_list, text_list, offsets = [], [], [0]
for (_label, _text) in batch:
label_list.append(int(_label)-1)
processed_text = torch.tensor(vocab(list(ngrams_iterator(tokenizer(_text), ngrams))), dtype=torch.int64)
label_list, text_list = [], []
for (_text, _label) in batch:
label_list.append(_label)
processed_text = torch.tensor(_text, dtype=torch.int64)
text_list.append(processed_text)
offsets.append(processed_text.size(0))
label_list = torch.tensor(label_list, dtype=torch.int64)
offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
text_list = torch.cat(text_list)
return label_list, text_list, offsets
return text_list, label_list

class TaskGenerator(FromDatasetGenerator):
def __init__(self):
super(TaskGenerator, self).__init__(benchmark=os.path.split(os.path.dirname(__file__))[-1],
train_data=train_data, val_data=val_data, test_data=test_data)

def prepare_data_for_partition(self):
self.train_data = self.train_data.map(lambda x: (x[1], x[0]))
return to_map_style_dataset(self.train_data)

class TaskPipe(FromDatasetPipe):
@@ -70,7 +43,7 @@ def __init__(self, task_path):

def save_task(self, generator):
client_names = self.gen_client_names(len(generator.local_datas))
feddata = {'client_names': client_names,}
feddata = {'client_names': client_names}
for cid in range(len(client_names)): feddata[client_names[cid]] = {'data': generator.local_datas[cid],}
with open(os.path.join(self.task_path, 'data.json'), 'w') as outf:
json.dump(feddata, outf)
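For orientation, the data.json written by save_task then has roughly this shape (shown as a Python literal; the client names and per-client contents are hypothetical: names come from gen_client_names, data from generator.local_datas):

feddata = {
    'client_names': ['Client01', 'Client02'],
    'Client01': {'data': [0, 4, 7, 12]},   # this client's share of the partitioned samples
    'Client02': {'data': [1, 2, 9]},
}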
45 changes: 37 additions & 8 deletions flgo/benchmark/multi30k_translation/config.py
@@ -8,19 +8,21 @@
import torch.nn.functional as F
import random

# 0. Load the dataset
language_pair = ['de', 'en']
path = os.path.join(flgo.benchmark.path, 'RAW_DATA', 'MULTI30K')
train_data, val_data, test_data = Multi30k(split=('train', 'valid', 'test'), language_pair=language_pair)

# 1. Load tokenizers and vocabularies
# init tokenizers
tokenizers = {}
tokenizers[language_pair[0]] = get_tokenizer('spacy', language='de_core_news_sm')
tokenizers[language_pair[1]] = get_tokenizer('spacy', language='en_core_web_sm')

# init vocabs
vocabs = {}
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
PAD_IDX, UNK_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<pad>', '<unk>', '<bos>', '<eos>']
for i,ln in enumerate(language_pair):
# Create torchtext's Vocab object
tokenizer = tokenizers[ln]
@@ -31,13 +33,33 @@
special_first=True)
for ln in language_pair: vocabs[ln].set_default_index(UNK_IDX)
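The special-symbol order was changed so that '<pad>' now maps to index 0. A short sketch of where that index typically matters downstream; the padding and loss wiring here is illustrative and not part of this file:

import torch
from torch.nn.utils.rnn import pad_sequence

PAD_IDX = 0
seqs = [torch.tensor([2, 11, 7, 3]), torch.tensor([2, 5, 3])]  # <bos> ... <eos> id sequences
padded = pad_sequence(seqs, padding_value=PAD_IDX)             # shape [max_len, batch] = [4, 2]
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)    # padding positions ignored in the loss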

def init_weights(m):
for name, param in m.named_parameters():
if 'weight' in name:
nn.init.normal_(param.data, mean=0, std=0.01)
else:
nn.init.constant_(param.data, 0)
# 2. Convert the strings in the dataset (source and target languages) into numerical tensors using the tokenizers and vocabs
def sequential_transforms(*transforms):
def func(txt_input):
for transform in transforms:
txt_input = transform(txt_input)
return txt_input
return func

def tensor_transform(token_ids):
return torch.cat((torch.tensor([BOS_IDX]),
torch.tensor(token_ids),
torch.tensor([EOS_IDX])))

text_transform = {}
for ln in language_pair:
text_transform[ln] = sequential_transforms(tokenizers[ln], #Tokenization
vocabs[ln], #Numericalization
tensor_transform) # Add BOS/EOS and create tensor

def apply_transform(x):
return text_transform[language_pair[0]](x[0].rstrip("\n")), text_transform[language_pair[1]](x[1].rstrip("\n"))

train_data = train_data.map(apply_transform)
val_data = val_data.map(apply_transform)
test_data = test_data.map(apply_transform)
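After .map(apply_transform) every sample is already a pair of <bos> ... <eos> id tensors, so batching only needs padding. One possible collate function, shown for illustration only (it is not part of this file):

from torch.nn.utils.rnn import pad_sequence

def collate_translation(batch):
    # batch: iterable of (src_tensor, tgt_tensor) pairs produced by apply_transform
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch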

# 3. Define the model
def get_model():
INPUT_DIM = len(vocabs[language_pair[0]])
OUTPUT_DIM = len(vocabs[language_pair[1]])
@@ -54,6 +76,13 @@ def get_model():
model.apply(init_weights)
return model

def init_weights(m):
for name, param in m.named_parameters():
if 'weight' in name:
nn.init.normal_(param.data, mean=0, std=0.01)
else:
nn.init.constant_(param.data, 0)

class Encoder(nn.Module):
def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
super().__init__()