Autoregressive transformers are spectacular models for short sequences but
scale poorly to long sequences such as high-resolution images, podcasts, code,
or books. We propose Megabyte, a multi-scale decoder architecture that enables
end-to-end differentiable modeling of sequences of over one million bytes.
Megabyte segments sequences into patches and uses a local submodel within
patches and a global model between patches. This enables sub-quadratic
self-attention, much larger feedforward layers for the same compute, and
improved parallelism during decoding, unlocking better performance at reduced
cost for both training and generation. Extensive experiments show that Megabyte
allows byte-level models to perform competitively with subword models on
long-context language modeling, achieve state-of-the-art density estimation on
ImageNet, and model audio from raw files. Together, these results establish the
viability of tokenization-free autoregressive sequence modeling at scale.
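
To make the architecture above concrete, the following is a minimal PyTorch
sketch of a Megabyte-style decoder, not the authors' implementation: the class
name MegabyteSketch, the patch size, the layer counts, and all dimensions are
illustrative assumptions. It shows the structure the abstract describes: a
global transformer attends causally across the T/P patch embeddings, and a
small local transformer decodes the P bytes inside each patch, so
self-attention cost falls from O(T^2) to roughly O((T/P)^2) for the global
model plus O(T*P) for the locals.

import torch
import torch.nn as nn

def causal_mask(n):
    # Boolean mask: True above the diagonal blocks attention to future positions.
    return torch.triu(torch.ones(n, n, dtype=torch.bool), diagonal=1)

class MegabyteSketch(nn.Module):
    # Hypothetical Megabyte-style multiscale decoder; all sizes are assumptions.
    def __init__(self, vocab=256, patch=8, d_local=128, d_global=512):
        super().__init__()
        self.patch = patch
        self.embed = nn.Embedding(vocab, d_local)
        # Global model runs over one embedding per patch (bytes concatenated).
        self.to_global = nn.Linear(patch * d_local, d_global)
        g_layer = nn.TransformerEncoderLayer(d_global, nhead=8, batch_first=True)
        self.global_model = nn.TransformerEncoder(g_layer, num_layers=4)
        # Local model decodes the bytes of a single patch, conditioned on
        # the global representation of the preceding patches.
        self.from_global = nn.Linear(d_global, patch * d_local)
        l_layer = nn.TransformerEncoderLayer(d_local, nhead=4, batch_first=True)
        self.local_model = nn.TransformerEncoder(l_layer, num_layers=2)
        self.head = nn.Linear(d_local, vocab)

    def forward(self, x):  # x: (B, T) byte ids, T divisible by patch
        B, T = x.shape
        K, P = T // self.patch, self.patch
        h = self.embed(x).view(B, K, P, -1)              # bytes grouped by patch
        g = self.to_global(h.reshape(B, K, P * h.size(-1)))
        g = self.global_model(g, mask=causal_mask(K))    # (T/P)^2 attention
        # Shift right one patch so patch k sees only patches < k.
        g = torch.cat([torch.zeros_like(g[:, :1]), g[:, :-1]], dim=1)
        l = h + self.from_global(g).view(B, K, P, -1)
        l = self.local_model(l.reshape(B * K, P, -1),    # (T/P) * P^2 attention
                             mask=causal_mask(P))
        # Logits at position i score byte i+1; train with shifted targets.
        return self.head(l).view(B, T, -1)

model = MegabyteSketch()
logits = model(torch.randint(0, 256, (2, 64)))           # -> (2, 64, 256)

A real implementation would also shift the local inputs so the first byte of
each patch is predicted from a start-of-patch embedding; the sketch elides
that detail.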
Description
MEGABYTE: Predicting Million-byte Sequences with Multiscale Transformers
%0 Generic
%1 yu2023megabyte
%A Yu, Lili
%A Simig, Dániel
%A Flaherty, Colin
%A Aghajanyan, Armen
%A Zettlemoyer, Luke
%A Lewis, Mike
%D 2023
%K attention
%T MEGABYTE: Predicting Million-byte Sequences with Multiscale Transformers
%U http://arxiv.org/abs/2305.07185
@misc{yu2023megabyte,
author = {Yu, Lili and Simig, Dániel and Flaherty, Colin and Aghajanyan, Armen and Zettlemoyer, Luke and Lewis, Mike},
keywords = {attention},
note = {arXiv:2305.07185},
title = {MEGABYTE: Predicting Million-byte Sequences with Multiscale Transformers},
url = {http://arxiv.org/abs/2305.07185},
year = 2023
}