We consider the problem of finding duplicates in data streams. Duplicate detection in data streams is utilized in various applications including fraud detection. We develop a solution based on Bloom Filters [9], and discuss the space and time requirements for running the proposed algorithm in both the contexts of sliding, and landmark stream windows. We run a comprehensive set of experiments, using both real and synthetic click streams, to evaluate the performance of the proposed solution. The results demonstrate that the proposed solution yields extremely low error rates.
%0 Conference Paper
%1 1060753
%A Metwally, Ahmed
%A Agrawal, Divyakant
%A Abbadi, Amr El
%B WWW '05: Proceedings of the 14th international conference on World Wide Web
%C New York, NY, USA
%D 2005
%I ACM
%K algorithms bloomfilter data_structures datamining duplicate
%P 12--21
%R http://doi.acm.org/10.1145/1060745.1060753
%T Duplicate detection in click streams
%U http://portal.acm.org/citation.cfm?id=1060745.1060753
%X We consider the problem of finding duplicates in data streams. Duplicate detection in data streams is utilized in various applications including fraud detection. We develop a solution based on Bloom Filters [9], and discuss the space and time requirements for running the proposed algorithm in both the contexts of sliding, and landmark stream windows. We run a comprehensive set of experiments, using both real and synthetic click streams, to evaluate the performance of the proposed solution. The results demonstrate that the proposed solution yields extremely low error rates.
%@ 1-59593-046-9
@comment{Conference paper record. The doi field holds the bare DOI (no resolver prefix); address is the publisher city and venue is the conference site.}
@inproceedings{1060753,
  author      = {Metwally, Ahmed and Agrawal, Divyakant and Abbadi, Amr El},
  title       = {Duplicate detection in click streams},
  booktitle   = {{WWW} '05: Proceedings of the 14th international conference on {World Wide Web}},
  pages       = {12--21},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  venue       = {Chiba, Japan},
  year        = {2005},
  isbn        = {1-59593-046-9},
  doi         = {10.1145/1060745.1060753},
  url         = {http://portal.acm.org/citation.cfm?id=1060745.1060753},
  abstract    = {We consider the problem of finding duplicates in data streams. Duplicate detection in data streams is utilized in various applications including fraud detection. We develop a solution based on Bloom Filters [9], and discuss the space and time requirements for running the proposed algorithm in both the contexts of sliding, and landmark stream windows. We run a comprehensive set of experiments, using both real and synthetic click streams, to evaluate the performance of the proposed solution. The results demonstrate that the proposed solution yields extremely low error rates.},
  keywords    = {algorithms bloomfilter data_structures datamining duplicate},
  description = {Duplicate detection in click streams},
  added-at    = {2007-12-17T01:03:12.000+0100},
  timestamp   = {2007-12-17T01:03:12.000+0100},
  biburl      = {https://www.bibsonomy.org/bibtex/2f6b44ae67e9d3960e3a1bb7fe630ea5f/jhammerb},
  interhash   = {c82d08591d37549fce80c4697ef579fb},
  intrahash   = {f6b44ae67e9d3960e3a1bb7fe630ea5f},
}