Hyperlinks and other relations in Wikipedia are an extraordinary resource
which is still not fully understood. In this paper we study the different types
of links in Wikipedia, and contrast the use of the full graph with respect to
just direct links. We apply a well-known random walk algorithm on two tasks,
word relatedness and named-entity disambiguation. We show that using the full
graph is more effective than just direct links by a large margin, that
non-reciprocal links harm performance, and that there is no benefit from
categories and infoboxes, with coherent results on both tasks. We set new
state-of-the-art figures for systems based on Wikipedia links, comparable to
systems exploiting several information sources and/or supervised machine
learning. Our approach is open source, with instructions to reproduce results,
and amenable to be integrated with complementary text-based methods.
Description
Studying the Wikipedia Hyperlink Graph for Relatedness and
Disambiguation
%0 Generic
%1 agirre2015studying
%A Agirre, Eneko
%A Barrena, Ander
%A Soroa, Aitor
%D 2015
%K disambiguation link relatedness wikipedia
%T Studying the Wikipedia Hyperlink Graph for Relatedness and Disambiguation
%U http://arxiv.org/abs/1503.01655
%X Hyperlinks and other relations in Wikipedia are an extraordinary resource
which is still not fully understood. In this paper we study the different types
of links in Wikipedia, and contrast the use of the full graph with respect to
just direct links. We apply a well-known random walk algorithm on two tasks,
word relatedness and named-entity disambiguation. We show that using the full
graph is more effective than just direct links by a large margin, that
non-reciprocal links harm performance, and that there is no benefit from
categories and infoboxes, with coherent results on both tasks. We set new
state-of-the-art figures for systems based on Wikipedia links, comparable to
systems exploiting several information sources and/or supervised machine
learning. Our approach is open source, with instructions to reproduce results,
and amenable to be integrated with complementary text-based methods.
% arXiv preprint record; the biburl field points at a BibSonomy post.
% The arXiv identifier is stored in eprint/archiveprefix rather than in note,
% so eprint-aware styles can format and link it. The title carries no trailing
% period (the style supplies punctuation) and braces the proper noun so that
% sentence-casing styles keep its capital. The fields after keywords are
% BibSonomy bookkeeping data and are not printed by the standard styles.
@misc{agirre2015studying,
  author        = {Agirre, Eneko and Barrena, Ander and Soroa, Aitor},
  title         = {Studying the {Wikipedia} Hyperlink Graph for Relatedness and Disambiguation},
  year          = {2015},
  eprint        = {1503.01655},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1503.01655},
  abstract      = {Hyperlinks and other relations in Wikipedia are an extraordinary resource
                   which is still not fully understood. In this paper we study the different types
                   of links in Wikipedia, and contrast the use of the full graph with respect to
                   just direct links. We apply a well-known random walk algorithm on two tasks,
                   word relatedness and named-entity disambiguation. We show that using the full
                   graph is more effective than just direct links by a large margin, that
                   non-reciprocal links harm performance, and that there is no benefit from
                   categories and infoboxes, with coherent results on both tasks. We set new
                   state-of-the-art figures for systems based on Wikipedia links, comparable to
                   systems exploiting several information sources and/or supervised machine
                   learning. Our approach is open source, with instructions to reproduce results,
                   and amenable to be integrated with complementary text-based methods.},
  keywords      = {disambiguation link relatedness wikipedia},
  description   = {Studying the Wikipedia Hyperlink Graph for Relatedness and Disambiguation},
  added-at      = {2017-12-17T12:18:47.000+0100},
  timestamp     = {2017-12-17T12:18:47.000+0100},
  biburl        = {https://www.bibsonomy.org/bibtex/2656a349fd45e2b18b951bb2a8a4f4304/thoni},
  interhash     = {eb3ae939d9460b051448f5200c511363},
  intrahash     = {656a349fd45e2b18b951bb2a8a4f4304},
}