One of the most common operations in analytic query processing is the application of an aggregate function to the result of a relational join. We describe an algorithm called the Sort-Merge-Shrink (SMS) Join for computing the answer to such a query over large, disk-based input tables. The key innovation of the SMS join is that if the input data are clustered in a statistically random fashion on disk, then at all times, the join provides an online, statistical estimator for the eventual answer to the query as well as probabilistic confidence bounds. Thus, a user can monitor the progress of the join throughout its execution and stop the join when satisfied with the estimate's accuracy or run the algorithm to completion with a total time requirement that is not much longer than that of other common join algorithms. This contrasts with other online join algorithms, which either do not offer such statistical guarantees or can only offer guarantees so long as the input data can fit into main memory.
%0 Journal Article
%1 1189775
%A Jermaine, Christopher
%A Dobra, Alin
%A Arumugam, Subramanian
%A Joshi, Shantanu
%A Pol, Abhijit
%C New York, NY, USA
%D 2006
%I ACM
%J ACM Trans. Database Syst.
%K database join olap reporting retrieval sampling
%N 4
%P 1382--1416
%R 10.1145/1189769.1189775
%T The Sort-Merge-Shrink join
%U http://portal.acm.org/citation.cfm?id=1189769.1189775
%V 31
%X One of the most common operations in analytic query processing is the application of an aggregate function to the result of a relational join. We describe an algorithm called the Sort-Merge-Shrink (SMS) Join for computing the answer to such a query over large, disk-based input tables. The key innovation of the SMS join is that if the input data are clustered in a statistically random fashion on disk, then at all times, the join provides an online, statistical estimator for the eventual answer to the query as well as probabilistic confidence bounds. Thus, a user can monitor the progress of the join throughout its execution and stop the join when satisfied with the estimate's accuracy or run the algorithm to completion with a total time requirement that is not much longer than that of other common join algorithms. This contrasts with other online join algorithms, which either do not offer such statistical guarantees or can only offer guarantees so long as the input data can fit into main memory.
@comment{Journal article record for the Sort-Merge-Shrink join paper. The doi field holds the bare DOI with no resolver prefix, and the inner braces in the title keep the algorithm name capitalised under sentence-casing styles. The fields added-at, biburl, description, interhash, intrahash, keywords and timestamp are not standard BibTeX fields and are ignored by the standard styles.}
@article{1189775,
  author      = {Jermaine, Christopher and Dobra, Alin and Arumugam, Subramanian and Joshi, Shantanu and Pol, Abhijit},
  title       = {The {Sort-Merge-Shrink} join},
  journal     = {ACM Trans. Database Syst.},
  volume      = {31},
  number      = {4},
  pages       = {1382--1416},
  year        = {2006},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  issn        = {0362-5915},
  doi         = {10.1145/1189769.1189775},
  url         = {http://portal.acm.org/citation.cfm?id=1189769.1189775},
  abstract    = {One of the most common operations in analytic query processing is the application of an aggregate function to the result of a relational join. We describe an algorithm called the Sort-Merge-Shrink (SMS) Join for computing the answer to such a query over large, disk-based input tables. The key innovation of the SMS join is that if the input data are clustered in a statistically random fashion on disk, then at all times, the join provides an online, statistical estimator for the eventual answer to the query as well as probabilistic confidence bounds. Thus, a user can monitor the progress of the join throughout its execution and stop the join when satisfied with the estimate's accuracy or run the algorithm to completion with a total time requirement that is not much longer than that of other common join algorithms. This contrasts with other online join algorithms, which either do not offer such statistical guarantees or can only offer guarantees so long as the input data can fit into main memory.},
  keywords    = {database join olap reporting retrieval sampling},
  description = {The Sort-Merge-Shrink join},
  added-at    = {2007-12-18T00:47:03.000+0100},
  timestamp   = {2007-12-18T00:47:03.000+0100},
  biburl      = {https://www.bibsonomy.org/bibtex/27bfac50eac877b9ac12425cbf067d7b3/jhammerb},
  interhash   = {8968fde6bf50c0edf247684f622ee30e},
  intrahash   = {7bfac50eac877b9ac12425cbf067d7b3},
}