Given only the URL of a Web page, can we identify its topic? We study this problem in detail by exploring a large number of different feature sets and algorithms on several datasets. We also show that the inherent overlap between topics and the sparsity of the information in URLs makes this a very challenging problem. Web page classification without a page’s content is desirable when the content is not available at all, when a classification is needed before obtaining the content, or when classification speed is of utmost importance. For our experiments we used five different corpora comprising a total of about 3 million (URL, classification) pairs. We evaluated several techniques for feature generation and classification algorithms. The individual binary classifiers were then combined via boosting into metabinary classifiers. We achieve typical F-measure values between 80 and 85, and a typical precision of around 86. The precision can be pushed further over 90 while maintaining a typical level of recall between 30 and 40.
Beschreibung
A Comprehensive Study of Features and Algorithms for URL-Based Topic Classification | ACM Transactions on the Web
%0 Journal Article
%1 baykan2011comprehensive
%A Baykan, Eda
%A Henzinger, Monika
%A Marian, Ludmila
%A Weber, Ingmar
%D 2011
%I Association for Computing Machinery (ACM)
%J ACM Transactions on the Web
%K classification link url web
%N 3
%P 1--29
%R 10.1145/1993053.1993057
%T A Comprehensive Study of Features and Algorithms for URL-Based Topic Classification
%U https://doi.org/10.1145/1993053.1993057
%V 5
%X Given only the URL of a Web page, can we identify its topic? We study this problem in detail by exploring a large number of different feature sets and algorithms on several datasets. We also show that the inherent overlap between topics and the sparsity of the information in URLs makes this a very challenging problem. Web page classification without a page’s content is desirable when the content is not available at all, when a classification is needed before obtaining the content, or when classification speed is of utmost importance. For our experiments we used five different corpora comprising a total of about 3 million (URL, classification) pairs. We evaluated several techniques for feature generation and classification algorithms. The individual binary classifiers were then combined via boosting into metabinary classifiers. We achieve typical F-measure values between 80 and 85, and a typical precision of around 86. The precision can be pushed further over 90 while maintaining a typical level of recall between 30 and 40.
@article{baykan2011comprehensive,
  author      = {Baykan, Eda and Henzinger, Monika and Marian, Ludmila and Weber, Ingmar},
  title       = {A Comprehensive Study of Features and Algorithms for {URL}-Based Topic Classification},
  journal     = {{ACM} Transactions on the Web},
  volume      = {5},
  number      = {3},
  pages       = {1--29},
  month       = jul,
  year        = {2011},
  publisher   = {Association for Computing Machinery ({ACM})},
  doi         = {10.1145/1993053.1993057},
  url         = {https://doi.org/10.1145/1993053.1993057},
  keywords    = {classification link url web},
  abstract    = {Given only the URL of a Web page, can we identify its topic? We study this problem in detail by exploring a large number of different feature sets and algorithms on several datasets. We also show that the inherent overlap between topics and the sparsity of the information in URLs makes this a very challenging problem. Web page classification without a page's content is desirable when the content is not available at all, when a classification is needed before obtaining the content, or when classification speed is of utmost importance. For our experiments we used five different corpora comprising a total of about 3 million (URL, classification) pairs. We evaluated several techniques for feature generation and classification algorithms. The individual binary classifiers were then combined via boosting into metabinary classifiers. We achieve typical F-measure values between 80 and 85, and a typical precision of around 86. The precision can be pushed further over 90 while maintaining a typical level of recall between 30 and 40.},
  description = {A Comprehensive Study of Features and Algorithms for URL-Based Topic Classification | ACM Transactions on the Web},
  added-at    = {2023-11-08T15:41:06.000+0100},
  timestamp   = {2023-11-08T15:41:06.000+0100},
  biburl      = {https://www.bibsonomy.org/bibtex/2383cc06c01b67cadc222039436907886/jaeschke},
  interhash   = {47b728419cbb49db61f8bd4c2871f2ad},
  intrahash   = {383cc06c01b67cadc222039436907886},
}