mclust is a contributed R package for model-based clustering, classification, and density estimation based on finite normal mixture modelling. It provides functions for parameter estimation via the EM algorithm for normal mixture models with a variety of covariance structures, and functions for simulation from these models. Also included are functions that combine model-based hierarchical clustering, EM for mixture estimation and the Bayesian Information Criterion (BIC) in comprehensive strategies for clustering, density estimation and discriminant analysis. Additional functionalities are available for displaying and visualizing fitted models along with clustering, classification, and density estimation results.
This document gives a quick tour of mclust functionalities. It was written in R Markdown, using the knitr package for production. It corresponds to mclust version 5.0.0. See the help pages for further details, and the references provided by citation("mclust").
library(mclust)
## Package 'mclust' version 5.0.0
## Type 'citation("mclust")' for citing this R package in publications.
# Load the diabetes data, keep its class column as a separate vector, and
# tabulate how many observations fall in each class.
data(diabetes)
class <- diabetes[["class"]]
table(class)
## class
## Chemical Normal Overt
## 36 76 33
# Everything except the first column becomes the measurement matrix X;
# head() previews its first rows.
X <- diabetes[, -1]
head(X)
## glucose insulin sspg
## 1 80 356 124
## 2 97 289 117
## 3 105 319 143
## 4 90 356 199
## 5 90 323 240
## 6 86 381 157
# Plot the measurements in X together with the known class labels.
clPairs(X, class)

# Compute BIC for the candidate mixture models on X, then plot the values and
# print the best-scoring models.
BIC <- mclustBIC(X)
plot(BIC)
summary(BIC)
## Best BIC values:
## VVV,3 VVE,3 EVE,4
## BIC -4770.044 -4785.48868 -4803.21521
## BIC diff 0.000 -15.44456 -33.17109
# Fit a Gaussian finite mixture model to X by EM and print the fit together
# with the estimated parameters (mixing probabilities, means, variances).
mod1 <- Mclust(X)
summary(mod1, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 3 components:
##
## log.likelihood n df BIC ICL
## -2307.883 145 29 -4760.091 -4776.086
##
## Clustering table:
## 1 2 3
## 82 33 30
##
## Mixing probabilities:
## 1 2 3
## 0.5618662 0.2233077 0.2148261
##
## Means:
## [,1] [,2] [,3]
## glucose 91.41166 105.1978 219.40355
## insulin 358.82811 517.0420 1041.36946
## sspg 166.15629 320.7894 98.33493
##
## Variances:
## [,,1]
## glucose insulin sspg
## glucose 61.94482 98.79404 35.04002
## insulin 98.79404 2123.14811 387.06112
## sspg 35.04002 387.06112 2681.21187
## [,,2]
## glucose insulin sspg
## glucose 153.2207 795.5581 -494.7416
## insulin 795.5581 6513.7790 -2846.4416
## sspg -494.7416 -2846.4416 26074.9825
## [,,3]
## glucose insulin sspg
## glucose 6344.602 26160.77 -4433.706
## insulin 26160.768 122003.26 -22714.951
## sspg -4433.706 -22714.95 5892.777
# Draw the classification display for the fitted clustering model.
plot(mod1, what = "classification")
# Cross-tabulate the known classes (rows) against the cluster assigned by
# mod1 (columns).
table(class, mod1$classification)
##
## class 1 2 3
## Chemical 8 26 2
## Normal 74 2 0
## Overt 0 5 28
# Compute ICL for the candidate mixture models on X and print the
# best-scoring ones.
ICL <- mclustICL(X)
summary(ICL)
## Best ICL values:
## VVV,3 VVE,3 EVE,4
## ICL -4776.086 -4793.2680 -4809.16854
## ICL diff 0.000 -17.1821 -33.08265
# Plot the ICL values across the candidate models.
plot(ICL)

# Bootstrap sequential likelihood-ratio test for the number of mixture
# components, restricted to the VVV model; the bare name prints the result.
LRT <- mclustBootstrapLRT(X, modelName = "VVV")
LRT
## Bootstrap sequential LRT for the number of mixture components
## -------------------------------------------------------------
## Model = VVV
## Replications = 999
## LRTS bootstrap p-value
## 1 vs 2 361.186445 0.001
## 2 vs 3 114.703559 0.001
## 3 vs 4 7.437806 0.939
# Load iris, keep the Species column as the class labels, and tabulate how
# many observations each species has.
data(iris)
class <- iris[["Species"]]
table(class)
## class
## setosa versicolor virginica
## 50 50 50
# The first four columns of iris form the measurement matrix X; head()
# previews its first rows.
X <- iris[, 1:4]
head(X)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
# Fit a Gaussian mixture classifier of type EDDA to X using the known class
# labels, then print the model and its training classification summary.
mod2 <- MclustDA(X, class, modelType = "EDDA")
summary(mod2)
## ------------------------------------------------
## Gaussian finite mixture model for classification
## ------------------------------------------------
##
## EDDA model summary:
##
## log.likelihood n df BIC
## -187.7097 150 38 -565.8236
##
## Classes n Model G
## setosa 50 VEV 1
## versicolor 50 VEV 1
## virginica 50 VEV 1
##
## Training classification summary:
##
## Predicted
## Class setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 47 3
## virginica 0 0 50
##
## Training error = 0.02
# Scatterplot and classification displays for the EDDA classifier.
plot(mod2, what = "scatterplot")
plot(mod2, what = "classification")

# Switch to the banknote data: keep the Status column as the class labels
# and tabulate how many notes fall in each status.
data(banknote)
class <- banknote[["Status"]]
table(class)
## class
## counterfeit genuine
## 100 100
# Everything except the first column of banknote becomes the measurement
# matrix X; head() previews its first rows.
X <- banknote[, -1]
head(X)
## Length Left Right Bottom Top Diagonal
## 1 214.8 131.0 131.1 9.0 9.7 141.0
## 2 214.6 129.7 129.7 8.1 9.5 141.7
## 3 214.8 129.7 129.7 8.7 9.6 142.2
## 4 214.8 129.7 129.6 7.5 10.4 142.0
## 5 215.0 129.6 129.7 10.4 7.7 141.8
## 6 215.7 130.8 130.5 9.0 10.1 141.4
# Fit a Gaussian mixture classifier to the banknote measurements with the
# modelType argument left at its default, then print the model summary.
mod3 <- MclustDA(X, class)
summary(mod3)
## ------------------------------------------------
## Gaussian finite mixture model for classification
## ------------------------------------------------
##
## MclustDA model summary:
##
## log.likelihood n df BIC
## -646.0798 200 66 -1641.848
##
## Classes n Model G
## counterfeit 100 EVE 2
## genuine 100 XXX 1
##
## Training classification summary:
##
## Predicted
## Class counterfeit genuine
## counterfeit 100 0
## genuine 0 100
##
## Training error = 0
# Scatterplot and classification displays for the banknote classifier.
plot(mod3, what = "scatterplot")
plot(mod3, what = "classification")
# Cross-validate the iris classifier (mod2) with nfold = 10; elements 2:3 of
# the result are flattened into a named vector that prints as `error`, `se`.
unlist(cvMclustDA(mod2, nfold = 10)[2:3])
## error se
## 0.02666667 0.01088662
# Same cross-validation with nfold = 10 for the banknote classifier (mod3);
# elements 2:3 of the result print as `error` and `se`.
unlist(cvMclustDA(mod3, nfold = 10)[2:3])
## error se
## 0.005 0.005
# Load the acidity data and estimate its density with a Gaussian finite
# mixture model, then print the selected model.
data(acidity)
mod4 <- densityMclust(acidity)
summary(mod4)
## -------------------------------------------------------
## Density estimation via Gaussian finite mixture modeling
## -------------------------------------------------------
##
## Mclust E (univariate, equal variance) model with 2 components:
##
## log.likelihood n df BIC ICL
## -185.9493 155 4 -392.0723 -398.5554
##
## Clustering table:
## 1 2
## 98 57
# Displays for the acidity density estimate: BIC, the fitted density drawn
# with the data supplied and breaks = 15, then the "cdf" and "qq" diagnostics.
plot(mod4, what = "BIC")
plot(mod4, what = "density", data = acidity, breaks = 15)
plot(mod4, what = "diagnostic", type = "cdf")
plot(mod4, what = "diagnostic", type = "qq")

# Repeat the density estimation on the faithful data and print the selected
# model.
data(faithful)
mod5 <- densityMclust(faithful)
summary(mod5)
## -------------------------------------------------------
## Density estimation via Gaussian finite mixture modeling
## -------------------------------------------------------
##
## Mclust EEE (ellipsoidal, equal volume, shape and orientation) model with 3 components:
##
## log.likelihood n df BIC ICL
## -1126.361 272 11 -2314.386 -2360.865
##
## Clustering table:
## 1 2 3
## 130 97 45
# Displays for the faithful density estimate: BIC, the default density plot,
# an image plot, and a perspective plot.
plot(mod5, what = "BIC")
plot(mod5, what = "density")
plot(
  mod5,
  what = "density", type = "image",
  col = "dodgerblue3", grid = 100
)
plot(mod5, what = "density", type = "persp")

# Bootstrap the diabetes clustering model (mod1) and report the bootstrap
# standard errors of its parameters.
boot1 <- MclustBootstrap(mod1)
summary(boot1, what = "se")
## Bootstrap standard errors
## ----------------------------------------
## Model = VVV
## Num. of mixture components = 3
## Replications = 999
##
## Mixing probabilities:
## 1 2 3
## 0.05016287 0.04563044 0.03844920
##
## Means:
## 1 2 3
## glucose 1.048604 3.72232 17.52620
## insulin 7.436813 25.51307 75.88028
## sspg 7.253418 33.91974 17.05377
##
## Variances:
## [,,1]
## glucose insulin sspg
## glucose 11.18436 51.67467 52.06727
## insulin 51.67467 492.85449 364.44729
## sspg 52.06727 364.44729 560.23662
## [,,2]
## glucose insulin sspg
## glucose 62.27318 467.1592 491.5765
## insulin 467.15918 3561.9528 3293.8502
## sspg 491.57655 3293.8502 7135.8855
## [,,3]
## glucose insulin sspg
## glucose 1123.923 5964.213 1761.580
## insulin 5964.213 37241.661 10919.481
## sspg 1761.580 10919.481 3156.073
# Report bootstrap confidence intervals for the parameters of mod1, computed
# from the same bootstrap object.
summary(boot1, what = "ci")
## Bootstrap confidence intervals
## ----------------------------------------
## Model = VVV
## Num. of mixture components = 3
## Replications = 999
## Confidence level = 0.95
##
## Mixing probabilities:
## 1 2 3
## 2.5% 0.4644103 0.1434245 0.1401601
## 97.5% 0.6519155 0.3232752 0.2925669
##
## Means:
## [,,1]
## glucose insulin sspg
## 2.5% 89.33215 343.5801 151.9062
## 97.5% 93.36431 373.2421 180.1879
## [,,2]
## glucose insulin sspg
## 2.5% 98.7532 472.1172 255.3913
## 97.5% 113.9337 578.6053 390.7337
## [,,3]
## glucose insulin sspg
## 2.5% 189.4671 900.8685 68.43588
## 97.5% 255.2770 1191.9511 133.25715
##
## Variances:
## [,,1]
## glucose insulin sspg
## 2.5% 40.15036 1218.370 1637.185
## 97.5% 82.28988 3027.744 3912.103
## [,,2]
## glucose insulin sspg
## 2.5% 63.46492 2020.641 12675.49
## 97.5% 330.39902 16497.320 40323.10
## [,,3]
## glucose insulin sspg
## 2.5% 3988.261 56457.91 1542.744
## 97.5% 8358.278 197156.39 12434.134
# Bootstrap the acidity density model (mod4) and report the bootstrap
# standard errors of its parameters.
boot4 <- MclustBootstrap(mod4)
summary(boot4, what = "se")
## Bootstrap standard errors
## ----------------------------------------
## Model = E
## Num. of mixture components = 2
## Replications = 999
##
## Mixing probabilities:
## 1 2
## 0.04124314 0.04124314
##
## Means:
## 1 2
## 0.04640443 0.06854613
##
## Variances:
## 1 2
## 0.02375279 0.02375279
# Report bootstrap confidence intervals for the parameters of mod4, computed
# from the same bootstrap object.
summary(boot4, what = "ci")
## Bootstrap confidence intervals
## ----------------------------------------
## Model = E
## Num. of mixture components = 2
## Replications = 999
## Confidence level = 0.95
##
## Mixing probabilities:
## 1 2
## 2.5% 0.5381047 0.2981008
## 97.5% 0.7018992 0.4618953
##
## Means:
## 1 2
## 2.5% 4.280819 6.185921
## 97.5% 4.461697 6.451439
##
## Variances:
## 1 2
## 2.5% 0.1420548 0.1420548
## 97.5% 0.2352836 0.2352836
# Dimension reduction for the diabetes clustering model; the summary lists
# the estimated basis vectors and their eigenvalues.
mod1dr <- MclustDR(mod1)
summary(mod1dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: Mclust (VVV, 3)
##
## Clusters n
## 1 82
## 2 33
## 3 30
##
## Estimated basis vectors:
## Dir1 Dir2 Dir3
## glucose -0.986035 0.23503 0.958753
## insulin 0.157678 -0.10980 -0.284026
## sspg -0.053589 -0.96577 -0.011029
##
## Dir1 Dir2 Dir3
## Eigenvalues 1.375 0.77745 0.65837
## Cum. % 48.919 76.57772 100.00000
# Pairs and boundaries displays for the reduced diabetes model; ngrid = 200
# is passed to the boundaries plot.
plot(mod1dr, what = "pairs")
plot(mod1dr, what = "boundaries", ngrid = 200)

# Redo the dimension reduction with lambda = 1, overwriting mod1dr, and
# print the new basis vectors and eigenvalues.
mod1dr <- MclustDR(mod1, lambda = 1)
summary(mod1dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: Mclust (VVV, 3)
##
## Clusters n
## 1 82
## 2 33
## 3 30
##
## Estimated basis vectors:
## Dir1 Dir2
## glucose 0.80928 0.92579
## insulin -0.56443 -0.19376
## sspg -0.16272 -0.32461
##
## Dir1 Dir2
## Eigenvalues 1.0589 0.39905
## Cum. % 72.6294 100.00000
# Scatterplot and boundaries displays for the lambda = 1 reduction of the
# diabetes model.
plot(mod1dr, what = "scatterplot")
plot(mod1dr, what = "boundaries", ngrid = 200)

# Dimension reduction for the iris EDDA classifier (mod2), followed by its
# basis vectors and eigenvalues.
mod2dr <- MclustDR(mod2)
summary(mod2dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: EDDA
##
## Classes n Model G
## setosa 50 VEV 1
## versicolor 50 VEV 1
## virginica 50 VEV 1
##
## Estimated basis vectors:
## Dir1 Dir2 Dir3 Dir4
## Sepal.Length 0.17425 -0.193663 0.64081 -0.46231
## Sepal.Width 0.45292 0.066561 0.34852 0.57110
## Petal.Length -0.61629 -0.311030 -0.42366 0.46256
## Petal.Width -0.62024 0.928076 0.53703 -0.49613
##
## Dir1 Dir2 Dir3 Dir4
## Eigenvalues 0.94747 0.68835 0.076141 0.052607
## Cum. % 53.69408 92.70374 97.018700 100.000000
# Scatterplot and boundaries displays for the reduced iris classifier.
plot(mod2dr, what = "scatterplot")
plot(mod2dr, what = "boundaries", ngrid = 200)

# Dimension reduction for the banknote classifier (mod3), followed by its
# basis vectors and eigenvalues.
mod3dr <- MclustDR(mod3)
summary(mod3dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: MclustDA
##
## Classes n Model G
## counterfeit 100 EVE 2
## genuine 100 XXX 1
##
## Estimated basis vectors:
## Dir1 Dir2 Dir3 Dir4 Dir5 Dir6
## Length -0.10053 -0.32853 0.797138 -0.033156 -0.3177204 0.084867
## Left -0.21757 -0.30448 -0.303012 -0.893163 0.3688689 -0.565765
## Right 0.29197 -0.01844 -0.495823 0.407884 -0.8616616 0.481501
## Bottom 0.57597 0.44534 0.120202 -0.034503 0.0042677 -0.078642
## Top 0.57549 0.38554 0.100993 -0.103770 0.1360968 0.625145
## Diagonal -0.44088 0.67235 -0.047607 -0.151104 -0.0445899 0.209460
##
## Dir1 Dir2 Dir3 Dir4 Dir5 Dir6
## Eigenvalues 0.87241 0.55366 0.48558 0.13317 0.053192 0.027221
## Cum. % 41.05017 67.10202 89.95013 96.21627 98.719164 100.000000
# Scatterplot and boundaries displays for the reduced banknote classifier;
# ngrid = 200 is passed to the boundaries plot.
plot(mod3dr, what = "scatterplot")
plot(mod3dr, what = "boundaries", ngrid = 200)