Natural languages are full of rules and exceptions. One of the most famous quantitative rules is Zipf's law, which states that the frequency of occurrence of a word is approximately inversely proportional to its rank. Though this “law” of ranks has been found to hold across disparate texts and forms of data, analyses of increasingly large corpora since the late 1990s have revealed the existence of two scaling regimes. These regimes have thus far been explained by a hypothesis suggesting a separability of languages into core and noncore lexica. Here we present and defend an alternative hypothesis that the two scaling regimes result from the act of aggregating texts. We observe that text mixing leads to an effective decay of word introduction, which we show provides accurate predictions of the location and severity of breaks in scaling. Upon examining large corpora from 10 languages in the Project Gutenberg eBooks collection, we find emphatic empirical support for the universality of our claim.
Description
Phys. Rev. E 91, 052811 (2015) - Text mixing shapes the anatomy of rank-frequency distributions
%0 Journal Article
%1 Williams2015
%A Williams, Jake Ryland
%A Bagrow, James P.
%A Danforth, Christopher M.
%A Dodds, Peter Sheridan
%D 2015
%I American Physical Society
%J Phys. Rev. E
%K mybook texts zipf
%N 5
%P 052811
%R 10.1103/PhysRevE.91.052811
%T Text mixing shapes the anatomy of rank-frequency distributions
%U https://link.aps.org/doi/10.1103/PhysRevE.91.052811
%V 91
%X Natural languages are full of rules and exceptions. One of the most famous quantitative rules is Zipf's law, which states that the frequency of occurrence of a word is approximately inversely proportional to its rank. Though this “law” of ranks has been found to hold across disparate texts and forms of data, analyses of increasingly large corpora since the late 1990s have revealed the existence of two scaling regimes. These regimes have thus far been explained by a hypothesis suggesting a separability of languages into core and noncore lexica. Here we present and defend an alternative hypothesis that the two scaling regimes result from the act of aggregating texts. We observe that text mixing leads to an effective decay of word introduction, which we show provides accurate predictions of the location and severity of breaks in scaling. Upon examining large corpora from 10 languages in the Project Gutenberg eBooks collection, we find emphatic empirical support for the universality of our claim.
% Williams et al. (2015), Phys. Rev. E 91, 052811. The pages field carries the
% article identifier given in the description field; it is braced so the
% leading zero is always kept verbatim.
@article{Williams2015,
  abstract    = {Natural languages are full of rules and exceptions. One of the most famous quantitative rules is Zipf's law, which states that the frequency of occurrence of a word is approximately inversely proportional to its rank. Though this ``law'' of ranks has been found to hold across disparate texts and forms of data, analyses of increasingly large corpora since the late 1990s have revealed the existence of two scaling regimes. These regimes have thus far been explained by a hypothesis suggesting a separability of languages into core and noncore lexica. Here we present and defend an alternative hypothesis that the two scaling regimes result from the act of aggregating texts. We observe that text mixing leads to an effective decay of word introduction, which we show provides accurate predictions of the location and severity of breaks in scaling. Upon examining large corpora from 10 languages in the Project Gutenberg eBooks collection, we find emphatic empirical support for the universality of our claim.},
  added-at    = {2017-12-04T16:43:17.000+0100},
  author      = {Williams, Jake Ryland and Bagrow, James P. and Danforth, Christopher M. and Dodds, Peter Sheridan},
  biburl      = {https://www.bibsonomy.org/bibtex/2dc6b868f970d10e81536384cdf819660/vitelot},
  description = {Phys. Rev. E 91, 052811 (2015) - Text mixing shapes the anatomy of rank-frequency distributions},
  doi         = {10.1103/PhysRevE.91.052811},
  interhash   = {4b22e31bcefda061cf3fe73376f26ff8},
  intrahash   = {dc6b868f970d10e81536384cdf819660},
  journal     = {Phys. Rev. E},
  keywords    = {mybook texts zipf},
  month       = may,
  number      = {5},
  numpages    = {8},
  pages       = {052811},
  publisher   = {American Physical Society},
  timestamp   = {2017-12-04T17:14:01.000+0100},
  title       = {Text mixing shapes the anatomy of rank-frequency distributions},
  url         = {https://link.aps.org/doi/10.1103/PhysRevE.91.052811},
  volume      = {91},
  year        = {2015}
}