Introduction

The goal of this article is to build a machine learning model that, given a sample of data, can classify the family of the distribution it was drawn from (e.g., normal, beta, gamma).

Methods

Parameters and Packages

# Plotting
library(ggplot2)

# Modelling
library(caret)
library(strip)
# library(parsnip)
# library(rsample)

# Optimizing
library(parallel)
library(doParallel)

# Analyzing
library(parameters)
library(bayestestR)
library(report)

set.seed(333)

Visualisation of distributions

Each distribution family is simulated over a grid of location and scale parameters, and the resulting normalized density curves are collected for plotting. The sample generation is wrapped in a generate_distribution() helper, which is reused in the data generation step below.

# Generate a random sample from a given family, parameterized by a location and a scale
generate_distribution <- function(family, size, location, scale){
  switch(family,
         "normal" = rnorm(size, location, scale),
         "beta" = rbeta(size, location, scale),
         "binomial" = rbinom(size, round(location) + 1, scale / 10^nchar(round(scale))),
         "chi" = rchisq(size, location, scale),
         "exponential" = rexp(size, scale),
         "F" = rf(size, location, scale),
         "gamma" = rgamma(size, location, scale),
         "lognormal" = rlnorm(size, location, scale),
         "poisson" = rpois(size, location),
         "t" = rt(size, location, scale),
         "uniform" = runif(size, location, location * 2),
         "weibull" = rweibull(size, location, scale))
}

# Estimate the density curve of a sample and normalize its coordinates to 0-1
normalized_density <- function(x, n = 100){
  d <- density(x, n = n)
  parameters::normalize(data.frame(x = d$x, y = d$y))
}

distributions <- data.frame()
size <- 1000
families <- c("normal", "beta", "binomial", "chi", "exponential", "F",
              "gamma", "lognormal", "poisson", "t", "uniform", "weibull")
for(location in round(seq(0.01, 10, length.out = 5), digits = 2)){
  for(scale in round(seq(0.02, 10, length.out = 5), digits = 2)){
    for(family in families){
      x <- normalized_density(generate_distribution(family, size, location, scale))
      x$distribution <- family
      x$location <- location
      x$scale <- scale
      distributions <- rbind(distributions, x)
    }
  }
}
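
As a quick sanity check, the helper can be called directly (the parameter values below are arbitrary):

# Draw a few values from a standard normal via the helper (arbitrary example values)
round(generate_distribution("normal", size = 5, location = 0, scale = 1), 2)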
ggplot(distributions, aes(x=x, y=y, colour=distribution)) +
  geom_line(size=1) +
  facet_grid(location ~ scale) +
  theme_classic()

Data generation

df <- data.frame()
# For each family, simulate 2000 samples with a random size, location and scale,
# and extract a set of features characterizing each sample
for(distribution in c("normal", "beta", "binomial", "chi", "exponential", "F", "gamma", "lognormal", "poisson", "t", "uniform", "weibull")){
  for(i in 1:2000){
    
    size <- round(runif(1, 10, 2000))
    location <- runif(1, 0.01, 10)
    scale <- runif(1, 0.02, 10)
    
    x <- generate_distribution(distribution, size=size, location=location, scale=scale)
    
    # Replace infinite values with an arbitrarily large finite number
    x[is.infinite(x)] <- 5.565423e+156
    x_scaled <- parameters::normalize(x, verbose=FALSE)
    
    # Density of the scaled sample, estimated at 20 equally spaced points
    density_Z <- density(x_scaled, n=20)$y
    
    # Extract features
    data <- data.frame(
      "Mean" = mean(x_scaled),
      "SD" = sd(x_scaled),
      "Median" = median(x_scaled),
      "MAD" = mad(x_scaled, constant=1),
      "Mean_Median_Distance" = mean(x_scaled) - median(x_scaled),
      "Mean_Mode_Distance" = mean(x_scaled) - as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")),
      "SD_MAD_Distance" = sd(x_scaled) - mad(x_scaled, constant=1),
      "Mode" = as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")),
      # "Range" = range(x),
      "Range_SD" = diff(range(x)) / sd(x),
      "Range_MAD" = diff(range(x)) / mad(x, constant=1),
      "IQR" = stats::IQR(x_scaled),
      "Skewness" = skewness(x_scaled),
      "Kurtosis" = kurtosis(x_scaled),
      "Uniques" = length(unique(x)) / length(x),
      "Smoothness_Cor" = parameters::smoothness(density(x_scaled)$y, method="cor"),
      "Smoothness_Diff" = parameters::smoothness(density(x_scaled)$y, method="diff"),
      "Smoothness_Z_Cor_1" = parameters::smoothness(density_Z, method="cor", lag=1),
      "Smoothness_Z_Diff_1" = parameters::smoothness(density_Z, method="diff", lag=1),
      "Smoothness_Z_Cor_3" = parameters::smoothness(density_Z, method="cor", lag=3),
      "Smoothness_Z_Diff_3" = parameters::smoothness(density_Z, method="diff", lag=3)
    )
    
    
    density_df <- as.data.frame(t(density_Z))
    names(density_df) <- paste0("Density_", 1:ncol(density_df))
    data <- cbind(data, density_df)
    
    # Degenerate samples (a single unique value) are relabelled as uniform
    if(length(unique(x)) == 1){
      data$Distribution <- "uniform"
    } else{
      data$Distribution <- distribution
    }
    
    df <- rbind(df, data)
  }
  # write.csv(df, "classify_distribution.csv", row.names = FALSE)
}
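
Before modelling, a quick check that the generated dataset is roughly balanced across families can be useful (a few degenerate samples may have been relabelled as uniform):

# Inspect the size and the class balance of the generated dataset
dim(df)
table(df$Distribution)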

Results

Model Selection
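
The code used to train and compare candidate models is not shown in this post. Below is a minimal sketch of what it could look like; the 80/20 split, the 5-fold cross-validation and the "rf" backend are assumptions, and only the object names train, test and model_rf are taken from the inspection code that follows.

# Sketch (assumptions: stratified 80/20 split, 5-fold CV, "rf" backend)
df$Distribution <- as.factor(df$Distribution)
train_index <- caret::createDataPartition(df$Distribution, p = 0.8, list = FALSE)
train <- df[train_index, ]
test <- df[-train_index, ]

# Register a parallel backend to speed up resampling
cluster <- parallel::makeCluster(parallel::detectCores() - 1)
doParallel::registerDoParallel(cluster)

model_rf <- caret::train(
  x = train[names(train) != "Distribution"],
  y = train$Distribution,
  method = "rf",
  trControl = caret::trainControl(method = "cv", number = 5, allowParallel = TRUE)
)

parallel::stopCluster(cluster)
foreach::registerDoSEQ()  # return to sequential processing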

Best Model Inspection

# Retain the best performing model (the random forest)
model <- model_rf

# Performance
test$pred <- predict(model, test)
confusion <- confusionMatrix(data = test$pred, reference = as.factor(test$Distribution), mode = "prec_recall")
knitr::kable(data.frame("Performance" = confusion$overall))

# Prediction Table (proportion of each reference class assigned to each predicted class)
knitr::kable(prop.table(confusion$table, margin = 2), digits = 2)

# Prediction Figure
perf <- as.data.frame(confusion$byClass)[c("Sensitivity", "Specificity")]
perf$Distribution <- gsub("Class: ", "", row.names(perf))
perf <- reshape(perf, varying = list(c("Sensitivity", "Specificity")), timevar = "Type", idvar = "Distribution", direction = "long", v.names = "Metric")
perf$Type <- ifelse(perf$Type == 1, "Sensitivity", "Specificity")
ggplot(perf, aes(x=Distribution, y=Metric, fill=Type)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  geom_hline(aes(yintercept=0.5), linetype="dotted") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Features
features <- caret::varImp(model, scale = TRUE)
plot(features)

# Error as a function of the number of trees
plot(model$finalModel, log="x", main="Error rate by number of trees")

Random Forest Training with Feature Selection
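
The training code for this step is not shown either; here is a sketch of one possible approach, assuming the random forest is simply refit on the most important predictors from caret::varImp (retaining the top 10 is an arbitrary choice).

# Sketch (assumption): refit the random forest on the 10 most important features
importance <- caret::varImp(model_rf, scale = TRUE)$importance
top_features <- rownames(importance)[order(importance$Overall, decreasing = TRUE)][1:10]

model <- caret::train(
  x = train[top_features],
  y = train$Distribution,
  method = "rf",
  trControl = caret::trainControl(method = "cv", number = 5, allowParallel = TRUE)
)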

Model Inspection

# Performance
test$pred <- predict(model, test)
confusion <- confusionMatrix(data = test$pred, reference = as.factor(test$Distribution), mode = "prec_recall")
knitr::kable(data.frame("Performance" = confusion$overall))

# Prediction Table (proportion of each reference class assigned to each predicted class)
knitr::kable(prop.table(confusion$table, margin = 2), digits = 2)

# Prediction Figure
perf <- as.data.frame(confusion$byClass)[c("Sensitivity", "Specificity")]
perf$Distribution <- gsub("Class: ", "", row.names(perf))
perf <- reshape(perf, varying = list(c("Sensitivity", "Specificity")), timevar = "Type", idvar = "Distribution", direction = "long", v.names = "Metric")
perf$Type <- ifelse(perf$Type == 1, "Sensitivity", "Specificity")
ggplot(perf, aes(x=Distribution, y=Metric, fill=Type)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  geom_hline(aes(yintercept=0.5), linetype="dotted") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Features
caret::varImp(model, scale = TRUE)

References

Package Version References
bayestestR 0.2.0 Makowski, D., Ben-Shachar M. S. & Lüdecke, D. (2019). Understand and Describe Bayesian Models and Posterior Distributions using bayestestR. CRAN. Available from https://github.com/easystats/bayestestR. DOI: 10.5281/zenodo.2556486.
caret 6.0.84 Max Kuhn. Contributions from Jed Wing, Steve Weston, Andre Williams, Chris Keefer, Allan Engelhardt, Tony Cooper, Zachary Mayer, Brenton Kenkel, the R Core Team, Michael Benesty, Reynald Lescarbeau, Andrew Ziem, Luca Scrucca, Yuan Tang, Can Candan and Tyler Hunt. (2019). caret: Classification and Regression Training. R package version 6.0-84. https://CRAN.R-project.org/package=caret
doParallel 1.0.14 Microsoft Corporation and Steve Weston (2018). doParallel: Foreach Parallel Adaptor for the ‘parallel’ Package. R package version 1.0.14. https://CRAN.R-project.org/package=doParallel
foreach 1.4.4 Microsoft and Steve Weston (2017). foreach: Provides Foreach Looping Construct for R. R package version 1.4.4. https://CRAN.R-project.org/package=foreach
ggplot2 3.1.1 H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2016.
iterators 1.0.10 Revolution Analytics and Steve Weston (2018). iterators: Provides Iterator Construct for R. R package version 1.0.10. https://CRAN.R-project.org/package=iterators
lattice 0.20.38 Sarkar, Deepayan (2008) Lattice: Multivariate Data Visualization with R. Springer, New York. ISBN 978-0-387-75968-5
parameters 0.1.0 Makowski, D. & Lüdecke, D. (2019). parameters. R package version 0.1.0. Available from https://github.com/easystats/parameters.
report 0.1.0 Makowski, D. & Lüdecke, D. (2019). The report package for R: Ensuring the use of best practices for results reporting. CRAN. Available from https://github.com/easystats/report.
strip 1.0.0 Paul Poncet (2018). strip: Lighten your R Model Outputs. R package version 1.0.0. https://CRAN.R-project.org/package=strip