We present the basic principles that underlie the high-performance implementation of the matrix-matrix multiplication that is part of the widely used GotoBLAS library. Design decisions are justified by successively refining a model of architectures with multilevel memories. A simple but effective algorithm for executing this operation results. Implementations on a broad selection of architectures are shown to achieve near-peak performance.
%0 Journal Article
%1 Goto:2008:AHM:1356052.1356053
%A Goto, Kazushige
%A van de Geijn, Robert A.
%C New York, NY, USA
%D 2008
%I ACM
%J ACM Trans. Math. Softw.
%K 2008 acm algorithms article linear-algebra matrix paper reference
%N 3
%P 12:1--12:25
%R 10.1145/1356052.1356053
%T Anatomy of High-Performance Matrix Multiplication
%U http://doi.acm.org/10.1145/1356052.1356053
%V 34
%X We present the basic principles that underlie the high-performance implementation of the matrix-matrix multiplication that is part of the widely used GotoBLAS library. Design decisions are justified by successively refining a model of architectures with multilevel memories. A simple but effective algorithm for executing this operation results. Implementations on a broad selection of architectures are shown to achieve near-peak performance.
% Journal article record: Goto and van de Geijn, ACM Transactions on Mathematical Software 34(3), article 12, May 2008.
@article{Goto:2008:AHM:1356052.1356053,
  abstract    = {We present the basic principles that underlie the high-performance implementation of the matrix-matrix multiplication that is part of the widely used GotoBLAS library. Design decisions are justified by successively refining a model of architectures with multilevel memories. A simple but effective algorithm for executing this operation results. Implementations on a broad selection of architectures are shown to achieve near-peak performance.},
  acmid       = {1356053},
  added-at    = {2018-05-29T06:04:39.000+0200},
  address     = {New York, NY, USA},
  articleno   = {12},
  author      = {Goto, Kazushige and van de Geijn, Robert A.},
  biburl      = {https://www.bibsonomy.org/bibtex/2b72c6525e5a3f849e98fa98201dfb2a7/achakraborty},
  description = {Anatomy of high-performance matrix multiplication},
  doi         = {10.1145/1356052.1356053},
  interhash   = {15a295c2c8e2c9fd37ccb9f3d9aa40be},
  intrahash   = {b72c6525e5a3f849e98fa98201dfb2a7},
  issn        = {0098-3500},
  issue_date  = {May 2008},
  journal     = {ACM Transactions on Mathematical Software},
  keywords    = {2008 acm algorithms article linear-algebra matrix paper reference},
  month       = may,
  number      = {3},
  numpages    = {25},
  pages       = {12:1--12:25},
  publisher   = {ACM},
  timestamp   = {2018-05-29T06:04:39.000+0200},
  title       = {Anatomy of High-Performance Matrix Multiplication},
  url         = {https://doi.org/10.1145/1356052.1356053},
  volume      = {34},
  year        = {2008},
}