An approach to the construction of classifiers from imbalanced datasets is described. A dataset is imbalanced if the classification categories are not approximately equally represented. Real-world datasets are often composed predominantly of "normal" examples with only a small percentage of "abnormal" or "interesting" examples, and the cost of misclassifying an abnormal (interesting) example as a normal one is often much higher than the cost of the reverse error. Under-sampling of the majority (normal) class has been proposed as a good means of increasing the sensitivity of a classifier to the minority class. This paper shows that combining our method of over-sampling the minority (abnormal) class with under-sampling the majority (normal) class achieves better classifier performance (in ROC space) than under-sampling the majority class alone, and better performance than varying the loss ratios in Ripper or the class priors in Naive Bayes. Our over-sampling method creates synthetic minority-class examples. Experiments are performed using C4.5, Ripper, and a Naive Bayes classifier. The method is evaluated using the area under the Receiver Operating Characteristic curve (AUC) and the ROC convex hull strategy.
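Since the abstract only summarizes the over-sampling mechanism, a minimal sketch may help: SMOTE builds each synthetic point by interpolating between a minority-class example and one of its k nearest minority-class neighbors. The sketch below assumes purely numeric features and uses a brute-force NumPy neighbor search; the function name and parameters are illustrative choices, not the paper's implementation.

import numpy as np

def smote_sample(X_min, k=5, n_synthetic=100, rng=None):
    # Generate synthetic minority-class points by interpolating between
    # a randomly chosen minority example and one of its k nearest minority
    # neighbors (assumes len(X_min) > k and numeric features).
    rng = np.random.default_rng() if rng is None else rng
    n = len(X_min)
    # Brute-force pairwise distances among minority examples only.
    d = np.linalg.norm(X_min[:, None, :] - X_min[None, :, :], axis=-1)
    np.fill_diagonal(d, np.inf)               # exclude self as a neighbor
    neighbors = np.argsort(d, axis=1)[:, :k]  # k nearest minority neighbors
    synthetic = []
    for _ in range(n_synthetic):
        i = rng.integers(n)                   # pick a minority example
        j = rng.choice(neighbors[i])          # pick one of its k neighbors
        gap = rng.random()                    # interpolation factor in [0, 1]
        synthetic.append(X_min[i] + gap * (X_min[j] - X_min[i]))
    return np.asarray(synthetic)

# Toy usage: 40 synthetic points from a 20-example minority class.
X_min = np.random.default_rng(0).normal(size=(20, 2))
X_syn = smote_sample(X_min, k=5, n_synthetic=40)

In practice, maintained implementations such as imbalanced-learn's SMOTE (via fit_resample) are preferable, and classifier performance on the rebalanced data can be compared in ROC space with, e.g., sklearn.metrics.roc_auc_score.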
Description
SMOTE: Synthetic Minority Over-sampling Technique | Journal of Artificial Intelligence Research
@article{chawla2002smote,
abstract = {An approach to the construction of classifiers from imbalanced datasets is described. A dataset is imbalanced if the classification categories are not approximately equally represented. Real-world datasets are often composed predominantly of ``normal'' examples with only a small percentage of ``abnormal'' or ``interesting'' examples, and the cost of misclassifying an abnormal (interesting) example as a normal one is often much higher than the cost of the reverse error. Under-sampling of the majority (normal) class has been proposed as a good means of increasing the sensitivity of a classifier to the minority class. This paper shows that combining our method of over-sampling the minority (abnormal) class with under-sampling the majority (normal) class achieves better classifier performance (in ROC space) than under-sampling the majority class alone, and better performance than varying the loss ratios in Ripper or the class priors in Naive Bayes. Our over-sampling method creates synthetic minority-class examples. Experiments are performed using C4.5, Ripper, and a Naive Bayes classifier. The method is evaluated using the area under the Receiver Operating Characteristic curve (AUC) and the ROC convex hull strategy.},
author = {Chawla, N. V. and Bowyer, K. W. and Hall, L. O. and Kegelmeyer, W. P.},
doi = {10.1613/jair.953},
issn = {1076-9757},
journal = {Journal of Artificial Intelligence Research},
keywords = {classification imbalanced learning machine smote},
pages = {321--357},
title = {SMOTE: Synthetic Minority Over-sampling Technique},
url = {https://www.jair.org/index.php/jair/article/view/10302},
volume = {16},
year = {2002}
}