Web information is ephemeral. Several organizations around the world are struggling to archive information from the web before it vanishes. However, users demand efficient and effective search mechanisms to access the already vast collections of historical information held by web archives. The Portuguese Web Archive is the largest full-text searchable web archive publicly available. It supports search over 1.2 billion files archived from the web since 1996. This study contributes with an overview of the lessons learned while developing the Portuguese Web Archive, focusing on web data acquisition, ranking search results and user interface design. The developed software is freely available as an open source project. We believe that sharing our experience obtained while developing and operating a running service will enable other organizations to start or improve their web archives.
%0 Conference Paper
%1 gomes2013creating
%A Gomes, Daniel
%A Costa, Miguel
%A Cruz, David
%A Miranda, João
%A Fontes, Simão
%B Proceedings of the 22nd International Conference on World Wide Web
%C New York, NY, USA
%D 2013
%I ACM
%K archive search web
%P 1059--1066
%R 10.1145/2487788.2488118
%T Creating a Billion-scale Searchable Web Archive
%U http://doi.acm.org/10.1145/2487788.2488118
%X Web information is ephemeral. Several organizations around the world are struggling to archive information from the web before it vanishes. However, users demand efficient and effective search mechanisms to access the already vast collections of historical information held by web archives. The Portuguese Web Archive is the largest full-text searchable web archive publicly available. It supports search over 1.2 billion files archived from the web since 1996. This study contributes with an overview of the lessons learned while developing the Portuguese Web Archive, focusing on web data acquisition, ranking search results and user interface design. The developed software is freely available as an open source project. We believe that sharing our experience obtained while developing and operating a running service will enable other organizations to start or improve their web archives.
%@ 978-1-4503-2038-2
@comment{gomes2013creating: conference paper in the WWW '13 Companion proceedings.
  Accented given names use BibTeX special-character form (brace group starting
  with the accent command) so they render and sort as single letters.
  "address" holds the publisher's city; "location" holds the conference venue.}
@inproceedings{gomes2013creating,
  abstract  = {Web information is ephemeral. Several organizations around the world are struggling to archive information from the web before it vanishes. However, users demand efficient and effective search mechanisms to access the already vast collections of historical information held by web archives. The Portuguese Web Archive is the largest full-text searchable web archive publicly available. It supports search over 1.2 billion files archived from the web since 1996. This study contributes with an overview of the lessons learned while developing the Portuguese Web Archive, focusing on web data acquisition, ranking search results and user interface design. The developed software is freely available as an open source project. We believe that sharing our experience obtained while developing and operating a running service will enable other organizations to start or improve their web archives.},
  acmid     = {2488118},
  added-at  = {2016-11-03T12:58:04.000+0100},
  address   = {New York, NY, USA},
  author    = {Gomes, Daniel and Costa, Miguel and Cruz, David and Miranda, Jo{\~a}o and Fontes, Sim{\~a}o},
  biburl    = {https://www.bibsonomy.org/bibtex/226910b6bb47f14ea0d17dd8095af9c1f/jaeschke},
  booktitle = {Proceedings of the 22nd International Conference on World Wide Web},
  doi       = {10.1145/2487788.2488118},
  interhash = {cb33c5bc71a052e3b175906767706853},
  intrahash = {26910b6bb47f14ea0d17dd8095af9c1f},
  isbn      = {978-1-4503-2038-2},
  keywords  = {archive search web},
  location  = {Rio de Janeiro, Brazil},
  numpages  = {8},
  pages     = {1059--1066},
  publisher = {ACM},
  series    = {WWW '13 Companion},
  timestamp = {2016-11-03T12:58:04.000+0100},
  title     = {Creating a Billion-scale Searchable Web Archive},
  url       = {http://doi.acm.org/10.1145/2487788.2488118},
  year      = {2013},
}