Statistical models usually require vector representations of categorical
variables, using for instance one-hot encoding. This strategy breaks down when
the number of categories grows, as it creates high-dimensional feature vectors.
Additionally, for string entries, one-hot encoding does not capture information
in their representation. Here, we seek low-dimensional encoding of
high-cardinality string categorical variables. Ideally, these should be:
scalable to many categories; interpretable to end users; and facilitate
statistical analysis. We introduce two encoding approaches for string
categories: a Gamma-Poisson matrix factorization on substring counts, and the
min-hash encoder, for fast approximation of string similarities. We show that
min-hash turns set inclusions into inequality relations that are easier to
learn. Both approaches are scalable and streamable. Experiments on real and
simulated data show that these methods improve supervised learning with
high-cardinality categorical variables. We recommend the following: if
scalability is central, the min-hash encoder is the best option as it does not
require any data fit; if interpretability is important, the Gamma-Poisson
factorization is the best alternative, as it can be interpreted as one-hot
encoding on inferred categories with informative feature names. Both models
enable autoML on the original string entries as they remove the need for
feature engineering or data cleaning.
%0 Generic
%1 cerda2019encoding
%A Cerda, Patricio
%A Varoquaux, Gaël
%D 2019
%K categorical encoder machinelearning
%R 10.1109/TKDE.2020.2992529
%T Encoding high-cardinality string categorical variables
%U http://arxiv.org/abs/1907.01860
%X Statistical models usually require vector representations of categorical
variables, using for instance one-hot encoding. This strategy breaks down when
the number of categories grows, as it creates high-dimensional feature vectors.
Additionally, for string entries, one-hot encoding does not capture information
in their representation. Here, we seek low-dimensional encoding of
high-cardinality string categorical variables. Ideally, these should be:
scalable to many categories; interpretable to end users; and facilitate
statistical analysis. We introduce two encoding approaches for string
categories: a Gamma-Poisson matrix factorization on substring counts, and the
min-hash encoder, for fast approximation of string similarities. We show that
min-hash turns set inclusions into inequality relations that are easier to
learn. Both approaches are scalable and streamable. Experiments on real and
simulated data show that these methods improve supervised learning with
high-cardinality categorical variables. We recommend the following: if
scalability is central, the min-hash encoder is the best option as it does not
require any data fit; if interpretability is important, the Gamma-Poisson
factorization is the best alternative, as it can be interpreted as one-hot
encoding on inferred categories with informative feature names. Both models
enable autoML on the original string entries as they remove the need for
feature engineering or data cleaning.
@misc{cerda2019encoding,
  abstract      = {Statistical models usually require vector representations of categorical
                   variables, using for instance one-hot encoding. This strategy breaks down when
                   the number of categories grows, as it creates high-dimensional feature vectors.
                   Additionally, for string entries, one-hot encoding does not capture information
                   in their representation. Here, we seek low-dimensional encoding of
                   high-cardinality string categorical variables. Ideally, these should be:
                   scalable to many categories; interpretable to end users; and facilitate
                   statistical analysis. We introduce two encoding approaches for string
                   categories: a Gamma-Poisson matrix factorization on substring counts, and the
                   min-hash encoder, for fast approximation of string similarities. We show that
                   min-hash turns set inclusions into inequality relations that are easier to
                   learn. Both approaches are scalable and streamable. Experiments on real and
                   simulated data show that these methods improve supervised learning with
                   high-cardinality categorical variables. We recommend the following: if
                   scalability is central, the min-hash encoder is the best option as it does not
                   require any data fit; if interpretability is important, the Gamma-Poisson
                   factorization is the best alternative, as it can be interpreted as one-hot
                   encoding on inferred categories with informative feature names. Both models
                   enable autoML on the original string entries as they remove the need for
                   feature engineering or data cleaning.},
  added-at      = {2020-11-06T20:00:04.000+0100},
  archiveprefix = {arXiv},
  author        = {Cerda, Patricio and Varoquaux, Ga{\"e}l},
  biburl        = {https://www.bibsonomy.org/bibtex/2883ecc4e1799457f17b0f3cb019d0bf6/cpankow},
  description   = {[1907.01860] Encoding high-cardinality string categorical variables},
  doi           = {10.1109/TKDE.2020.2992529},
  eprint        = {1907.01860},
  interhash     = {18af96110e1098dc591d2c300cbed2e3},
  intrahash     = {883ecc4e1799457f17b0f3cb019d0bf6},
  keywords      = {categorical encoder machinelearning},
  timestamp     = {2020-11-06T20:00:04.000+0100},
  title         = {Encoding High-Cardinality String Categorical Variables},
  url           = {https://arxiv.org/abs/1907.01860},
  year          = {2019},
}