Two-dimensional contingency or co-occurrence tables arise frequently in important applications such as text, web-log and market-basket data analysis. A basic problem in contingency table analysis is co-clustering: simultaneous clustering of the rows and columns. A novel theoretical formulation views the contingency table as an empirical joint probability distribution of two discrete random variables and poses the co-clustering problem as an optimization problem in information theory---the optimal co-clustering maximizes the mutual information between the clustered random variables subject to constraints on the number of row and column clusters. We present an innovative co-clustering algorithm that monotonically increases the preserved mutual information by intertwining both the row and column clusterings at all stages. Using the practical example of simultaneous word-document clustering, we demonstrate that our algorithm works well in practice, especially in the presence of sparsity and high-dimensionality.
%0 Conference Paper
%1 Dhillon03InformationCoclustering
%A Dhillon, Inderjit S.
%A Mallela, Subramanyam
%A Modha, Dharmendra S.
%B KDD '03: Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining
%C New York, NY, USA
%D 2003
%I ACM
%K 03 Dhillon clustering coclustering information text theory
%P 89--98
%R http://doi.acm.org/10.1145/956750.956764
%T Information-theoretic co-clustering
%U http://portal.acm.org/citation.cfm?id=956764
%X Two-dimensional contingency or co-occurrence tables arise frequently in important applications such as text, web-log and market-basket data analysis. A basic problem in contingency table analysis is co-clustering: simultaneous clustering of the rows and columns. A novel theoretical formulation views the contingency table as an empirical joint probability distribution of two discrete random variables and poses the co-clustering problem as an optimization problem in information theory---the optimal co-clustering maximizes the mutual information between the clustered random variables subject to constraints on the number of row and column clusters. We present an innovative co-clustering algorithm that monotonically increases the preserved mutual information by intertwining both the row and column clusterings at all stages. Using the practical example of simultaneous word-document clustering, we demonstrate that our algorithm works well in practice, especially in the presence of sparsity and high-dimensionality.
%@ 1-58113-737-0
% Conference paper from KDD 2003 (Dhillon, Mallela, Modha); entry originates from a BibSonomy export.
% Conventions used below: doi holds the bare identifier (no resolver prefix), address is the
% publisher's city, and venue is the city where the conference took place.
@inproceedings{Dhillon03InformationCoclustering,
  author      = {Dhillon, Inderjit S. and Mallela, Subramanyam and Modha, Dharmendra S.},
  title       = {Information-theoretic co-clustering},
  booktitle   = {{KDD} '03: Proceedings of the ninth {ACM} {SIGKDD} international conference on Knowledge discovery and data mining},
  pages       = {89--98},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  venue       = {Washington, D.C.},
  year        = {2003},
  isbn        = {1-58113-737-0},
  doi         = {10.1145/956750.956764},
  url         = {http://portal.acm.org/citation.cfm?id=956764},
  abstract    = {Two-dimensional contingency or co-occurrence tables arise frequently in important applications such as text, web-log and market-basket data analysis. A basic problem in contingency table analysis is co-clustering: simultaneous clustering of the rows and columns. A novel theoretical formulation views the contingency table as an empirical joint probability distribution of two discrete random variables and poses the co-clustering problem as an optimization problem in information theory---the optimal co-clustering maximizes the mutual information between the clustered random variables subject to constraints on the number of row and column clusters. We present an innovative co-clustering algorithm that monotonically increases the preserved mutual information by intertwining both the row and column clusterings at all stages. Using the practical example of simultaneous word-document clustering, we demonstrate that our algorithm works well in practice, especially in the presence of sparsity and high-dimensionality.},
  keywords    = {03 Dhillon clustering coclustering information text theory},
  description = {Information-theoretic co-clustering},
  added-at    = {2008-11-13T19:18:30.000+0100},
  timestamp   = {2009-02-02T16:40:19.000+0100},
  biburl      = {https://www.bibsonomy.org/bibtex/2108f77a4e13823f24497340ff1cba0ad/lee_peck},
  interhash   = {30fe4c22011ee3c5565d35709d9ce1f1},
  intrahash   = {108f77a4e13823f24497340ff1cba0ad},
}