Keyphrase extraction models are usually evaluated under different, not directly comparable, experimental setups. As a result, it remains unclear how well proposed models actually perform, and how they compare to each other. In this work, we address this issue by presenting a systematic large-scale analysis of state-of-the-art keyphrase extraction models involving multiple benchmark datasets from various sources and domains. Our main results reveal that state-of-the-art models are in fact still challenged by simple baselines on some datasets. We also present new insights about the impact of using author- or reader-assigned keyphrases as a proxy for gold standard, and give recommendations for strong baselines and reliable benchmark datasets.
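For context, benchmarks of this kind typically score a model by exact-match F1 between its top-k predicted phrases and the gold keyphrases (this paper reports F@10 over stemmed phrases). Below is a minimal sketch of that protocol, not the authors' code: the normalize helper is a simplified stand-in for the stemming step used in practice (e.g. Porter stemming via NLTK).

def normalize(phrase: str) -> str:
    """Lowercase and collapse whitespace; simplified stand-in for stemming."""
    return " ".join(phrase.lower().split())

def f1_at_k(predicted: list[str], gold: list[str], k: int = 10) -> float:
    """Exact-match F1 between the top-k predictions and the gold keyphrases."""
    top_k = [normalize(p) for p in predicted[:k]]
    gold_set = {normalize(g) for g in gold}
    matches = sum(1 for p in top_k if p in gold_set)
    if matches == 0:
        return 0.0
    precision = matches / len(top_k)
    recall = matches / len(gold_set)
    return 2 * precision * recall / (precision + recall)

# Example: scoring one document's predictions against author-assigned keyphrases.
predicted = ["keyphrase extraction", "neural networks", "evaluation"]
gold = ["keyphrase extraction", "benchmark datasets", "evaluation"]
print(f"F1@10 = {f1_at_k(predicted, gold):.3f}")  # 0.667

Corpus-level scores are then obtained by averaging this per-document measure over a benchmark dataset, which is the setting in which the paper compares models against simple baselines.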
@inproceedings{Gallina_2020,
abstract = {Keyphrase extraction models are usually evaluated under different, not directly comparable, experimental setups. As a result, it remains unclear how well proposed models actually perform, and how they compare to each other. In this work, we address this issue by presenting a systematic large-scale analysis of state-of-the-art keyphrase extraction models involving multiple benchmark datasets from various sources and domains. Our main results reveal that state-of-the-art models are in fact still challenged by simple baselines on some datasets. We also present new insights about the impact of using author- or reader-assigned keyphrases as a proxy for gold standard, and give recommendations for strong baselines and reliable benchmark datasets.},
author = {Gallina, Ygor and Boudin, Florian and Daille, B{\'{e}}atrice},
booktitle = {Proceedings of the {ACM}/{IEEE} Joint Conference on Digital Libraries in 2020},
doi = {10.1145/3383583.3398517},
month = aug,
pages = {271--278},
publisher = {{ACM}},
title = {Large-Scale Evaluation of Keyphrase Extraction Models},
url = {https://doi.org/10.1145/3383583.3398517},
year = 2020
}