X stands for experimental
A simple full-attention transformer, complete with a set of promising experimental features drawn from various papers.
```bash
$ pip install x-transformers
```
```python
import torch
from x_transformers import XTransformer

model = XTransformer(
    num_tokens = 256,
    dim = 512,
    depth = 6,
    heads = 8,
    max_seq_len = 1024
)

src = torch.randint(0, 256, (1, 1024))
tgt = torch.randint(0, 256, (1, 1024))

model(src, tgt) # (1, 1024, 256) - logits over the 256-token vocabulary
```
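Each citation below corresponds to one of the experimental features. Such features are switched on through keyword arguments at construction time; the sketch below shows what that typically looks like, where the exact keyword names (`ff_glu`, `use_scalenorm`, `attn_talking_heads`) are illustrative assumptions rather than the confirmed API:

```python
import torch
from x_transformers import XTransformer

# hypothetical feature flags - the names are assumptions for illustration,
# check the library source for the real keyword arguments
model = XTransformer(
    num_tokens = 256,
    dim = 512,
    depth = 6,
    heads = 8,
    max_seq_len = 1024,
    ff_glu = True,             # GLU feedforward variant (Shazeer, 2020)
    use_scalenorm = True,      # ScaleNorm in place of LayerNorm (Nguyen & Salazar, 2019)
    attn_talking_heads = True  # talking-heads attention (Shazeer et al., 2020)
)
```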
Citations:

```bibtex
@inproceedings{kitaev2020reformer,
    title     = {Reformer: The Efficient Transformer},
    author    = {Nikita Kitaev and Lukasz Kaiser and Anselm Levskaya},
    booktitle = {International Conference on Learning Representations},
    year      = {2020},
    url       = {https://openreview.net/forum?id=rkgNKkHtvB}
}
```
```bibtex
@article{DBLP:journals/corr/abs-1907-01470,
    author  = {Sainbayar Sukhbaatar and Edouard Grave and Guillaume Lample and Herv{\'{e}} J{\'{e}}gou and Armand Joulin},
    title   = {Augmenting Self-attention with Persistent Memory},
    journal = {CoRR},
    volume  = {abs/1907.01470},
    year    = {2019},
    url     = {http://arxiv.org/abs/1907.01470}
}
```
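The idea in brief: each attention layer carries a set of trained key/value vectors that every query can attend to, regardless of the input. A minimal single-head sketch (module name and shapes are illustrative, not the library's implementation):

```python
import torch
from torch import nn

class PersistentMemoryAttention(nn.Module):
    # single-head attention augmented with learned "persistent" key / value vectors
    def __init__(self, dim, num_mem_kv = 16):
        super().__init__()
        self.scale = dim ** -0.5
        self.to_qkv = nn.Linear(dim, dim * 3, bias = False)
        # persistent memory: trained parameters, shared across all inputs
        self.mem_k = nn.Parameter(torch.randn(num_mem_kv, dim))
        self.mem_v = nn.Parameter(torch.randn(num_mem_kv, dim))

    def forward(self, x):                      # x: (batch, seq, dim)
        q, k, v = self.to_qkv(x).chunk(3, dim = -1)
        b = x.shape[0]
        # concatenate the persistent memory to the keys and values of every sequence
        k = torch.cat((self.mem_k.expand(b, -1, -1), k), dim = 1)
        v = torch.cat((self.mem_v.expand(b, -1, -1), v), dim = 1)
        attn = (q @ k.transpose(-1, -2) * self.scale).softmax(dim = -1)
        return attn @ v                        # (batch, seq, dim)
```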
```bibtex
@misc{nguyen2019transformers,
    title   = {Transformers without Tears: Improving the Normalization of Self-Attention},
    author  = {Toan Q. Nguyen and Julian Salazar},
    year    = {2019},
    eprint  = {1910.05895},
    archivePrefix = {arXiv},
    doi     = {10.5281/zenodo.3525484}
}
```
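This paper proposes ScaleNorm: replace LayerNorm with l2 normalization scaled by a single learned scalar. A minimal sketch:

```python
import torch
from torch import nn

class ScaleNorm(nn.Module):
    # ScaleNorm: normalize x to unit l2 norm, then scale by one learned scalar g
    def __init__(self, dim, eps = 1e-5):
        super().__init__()
        self.g = nn.Parameter(torch.tensor(dim ** 0.5))  # paper initializes g to sqrt(dim)
        self.eps = eps

    def forward(self, x):
        norm = x.norm(dim = -1, keepdim = True).clamp(min = self.eps)
        return self.g * x / norm
```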
```bibtex
@misc{shazeer2020glu,
    title   = {GLU Variants Improve Transformer},
    author  = {Noam Shazeer},
    year    = {2020},
    eprint  = {2002.05202},
    archivePrefix = {arXiv}
}
```
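One of the variants from the paper is GEGLU, where the feedforward's first projection is gated: FFN_GEGLU(x) = (xW ⊙ GELU(xV)) W2. A minimal sketch (names illustrative):

```python
import torch
from torch import nn
import torch.nn.functional as F

class GEGLU(nn.Module):
    # gated feedforward: one half of the projection gates the other through a GELU
    def __init__(self, dim, mult = 4):
        super().__init__()
        inner = dim * mult
        self.proj_in = nn.Linear(dim, inner * 2)  # produces both the value and the gate
        self.proj_out = nn.Linear(inner, dim)

    def forward(self, x):
        x, gate = self.proj_in(x).chunk(2, dim = -1)
        return self.proj_out(x * F.gelu(gate))
```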
```bibtex
@misc{bachlechner2020rezero,
    title   = {ReZero is All You Need: Fast Convergence at Large Depth},
    author  = {Thomas Bachlechner and Bodhisattwa Prasad Majumder and Huanru Henry Mao and Garrison W. Cottrell and Julian McAuley},
    year    = {2020},
    eprint  = {2003.04887},
    archivePrefix = {arXiv}
}
```
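ReZero drops normalization entirely and scales each residual branch by a learned scalar initialized at zero, so every layer starts out as the identity. A minimal sketch:

```python
import torch
from torch import nn

class ReZero(nn.Module):
    # residual wrapper: x_{i+1} = x_i + alpha_i * F(x_i), with alpha_i = 0 at init
    def __init__(self, fn):
        super().__init__()
        self.fn = fn
        self.alpha = nn.Parameter(torch.zeros(1))

    def forward(self, x, **kwargs):
        return x + self.alpha * self.fn(x, **kwargs)
```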
```bibtex
@misc{bhojanapalli2020lowrank,
    title   = {Low-Rank Bottleneck in Multi-head Attention Models},
    author  = {Srinadh Bhojanapalli and Chulhee Yun and Ankit Singh Rawat and Sashank J. Reddi and Sanjiv Kumar},
    year    = {2020},
    eprint  = {2002.07028},
    archivePrefix = {arXiv}
}
```
```bibtex
@misc{burtsev2020memory,
    title   = {Memory Transformer},
    author  = {Mikhail S. Burtsev and Grigory V. Sapunov},
    year    = {2020},
    eprint  = {2006.11527},
    archivePrefix = {arXiv},
    primaryClass = {cs.CL}
}
```
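Memory Transformer prepends a small number of learned memory tokens to the input sequence, giving attention dedicated scratch space; the memory positions are discarded again before prediction. A minimal sketch:

```python
import torch
from torch import nn

class MemoryTokens(nn.Module):
    # prepend a handful of learned [mem] tokens to every sequence
    def __init__(self, dim, num_mem_tokens = 20):
        super().__init__()
        self.mem = nn.Parameter(torch.randn(num_mem_tokens, dim))

    def forward(self, x):                         # x: (batch, seq, dim)
        mem = self.mem.expand(x.shape[0], -1, -1)
        return torch.cat((mem, x), dim = 1)       # (batch, mem + seq, dim)
```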
```bibtex
@misc{correia2019adaptively,
    title   = {Adaptively Sparse Transformers},
    author  = {Gonçalo M. Correia and Vlad Niculae and André F. T. Martins},
    year    = {2019},
    eprint  = {1909.00015},
    archivePrefix = {arXiv},
    primaryClass = {cs.CL}
}
```
```bibtex
@misc{shazeer2020talkingheads,
    title   = {Talking-Heads Attention},
    author  = {Noam Shazeer and Zhenzhong Lan and Youlong Cheng and Nan Ding and Le Hou},
    year    = {2020},
    eprint  = {2003.02436},
    archivePrefix = {arXiv},
    primaryClass = {cs.LG}
}
```
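Talking-heads attention inserts learned linear projections across the head dimension, both before the softmax (on the logits) and after it (on the attention weights). A minimal multi-head sketch, where 1x1 convolutions stand in for the across-head projections:

```python
import torch
from torch import nn

class TalkingHeadsAttention(nn.Module):
    # multi-head attention where heads "talk" through learned mixing projections
    def __init__(self, dim, heads = 8):
        super().__init__()
        self.heads = heads
        self.scale = (dim // heads) ** -0.5
        self.to_qkv = nn.Linear(dim, dim * 3, bias = False)
        # 1x1 convs over the head dimension implement the across-head linear maps
        self.pre_softmax = nn.Conv2d(heads, heads, 1, bias = False)
        self.post_softmax = nn.Conv2d(heads, heads, 1, bias = False)
        self.to_out = nn.Linear(dim, dim)

    def forward(self, x):                                       # x: (batch, seq, dim)
        b, n, d, h = *x.shape, self.heads
        q, k, v = map(
            lambda t: t.reshape(b, n, h, -1).transpose(1, 2),   # (batch, heads, seq, dim_head)
            self.to_qkv(x).chunk(3, dim = -1)
        )
        logits = q @ k.transpose(-1, -2) * self.scale           # (batch, heads, seq, seq)
        logits = self.pre_softmax(logits)                       # mix heads before softmax
        weights = self.post_softmax(logits.softmax(dim = -1))   # mix heads after softmax
        out = (weights @ v).transpose(1, 2).reshape(b, n, d)
        return self.to_out(out)
```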
```bibtex
@misc{press2020improving,
    title   = {Improving Transformer Models by Reordering their Sublayers},
    author  = {Ofir Press and Noah A. Smith and Omer Levy},
    year    = {2020},
    eprint  = {1911.03864},
    archivePrefix = {arXiv},
    primaryClass = {cs.CL}
}
```
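The headline result is the "sandwich" ordering: push attention sublayers toward the bottom of the stack and feedforward sublayers toward the top, keeping the middle interleaved. A sketch of the ordering, parameterized by a sandwich coefficient (the function name is illustrative):

```python
def sandwich_order(depth, sandwich_coef):
    # first `sandwich_coef` sublayers are attention ('a'), last `sandwich_coef`
    # are feedforward ('f'), and the middle stays interleaved; the total sublayer
    # count (2 * depth) matches the usual alternating a-f stack
    assert 0 <= sandwich_coef <= depth
    middle = ['a', 'f'] * (depth - sandwich_coef)
    return ['a'] * sandwich_coef + middle + ['f'] * sandwich_coef

print(sandwich_order(6, 2))
# ['a', 'a', 'a', 'f', 'a', 'f', 'a', 'f', 'a', 'f', 'f', 'f']
```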
```bibtex
@misc{lu2019understanding,
    title   = {Understanding and Improving Transformer From a Multi-Particle Dynamic System Point of View},
    author  = {Yiping Lu and Zhuohan Li and Di He and Zhiqing Sun and Bin Dong and Tao Qin and Liwei Wang and Tie-Yan Liu},
    year    = {2019},
    eprint  = {1906.02762},
    archivePrefix = {arXiv},
    primaryClass = {cs.LG}
}
```
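This paper interprets a transformer layer as one step of an ODE solver and derives the Macaron layout: a half-step feedforward on either side of attention. A minimal sketch (normalization omitted for brevity; the module takes any attention and feedforward callables):

```python
import torch
from torch import nn

class MacaronBlock(nn.Module):
    # Macaron layout: half-step FFN, attention, half-step FFN
    def __init__(self, attn, ff1, ff2):
        super().__init__()
        self.attn, self.ff1, self.ff2 = attn, ff1, ff2

    def forward(self, x):
        x = x + 0.5 * self.ff1(x)  # half-step feedforward (0.5 = half the usual step size)
        x = x + self.attn(x)       # full attention step
        x = x + 0.5 * self.ff2(x)  # half-step feedforward
        return x
```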
```bibtex
@misc{zhang2020selfattention,
    title   = {When Can Self-Attention Be Replaced by Feed Forward Layers?},
    author  = {Shucong Zhang and Erfan Loweimi and Peter Bell and Steve Renals},
    year    = {2020},
    eprint  = {2005.13895},
    archivePrefix = {arXiv},
    primaryClass = {eess.AS}
}
```
```bibtex
@misc{ke2020rethinking,
    title   = {Rethinking Positional Encoding in Language Pre-training},
    author  = {Guolin Ke and Di He and Tie-Yan Liu},
    year    = {2020},
    eprint  = {2006.15595},
    archivePrefix = {arXiv},
    primaryClass = {cs.CL}
}
```
```bibtex
@misc{dai2020funneltransformer,
    title   = {Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing},
    author  = {Zihang Dai and Guokun Lai and Yiming Yang and Quoc V. Le},
    year    = {2020},
    eprint  = {2006.03236},
    archivePrefix = {arXiv},
    primaryClass = {cs.LG}
}
```