library(factorMerger)
library(ggplot2)
library(dplyr)
library(reshape2)
# Load the PISA 2012 data set and draw a bar chart of the number of rows per
# country (CNT). The x-axis labels are rotated by 90 degrees so that the
# country names do not overlap.
data("pisa2012")
ggplot(pisa2012, aes(x = CNT)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Reshape the data to long format, dropping missing values. No id.vars are
# given, so melt() chooses the id columns itself; the result is used below
# through its CNT, variable and value columns.
meltedPisa <- melt(pisa2012, na.rm = TRUE)

# Boxplots of the values per country, one facet per melted variable.
# Countries are ordered along the x-axis by their median value, computed over
# all rows of a country (i.e. pooled across the facets).
pisaResultsBySubject <- ggplot(
  meltedPisa,
  aes(x = reorder(CNT, value, FUN = median), y = value)
) +
  geom_boxplot() +
  facet_wrap(~variable) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Country")
pisaResultsBySubject
# Mean of the melted values for each variable; meltedPisa was built with
# na.rm = TRUE, so no missing values enter mean().
subjectMeans <- meltedPisa %>%
  group_by(variable) %>%
  summarise(mean = mean(value))

# Overlay each facet's mean on the boxplots as a red horizontal line.
pisaResultsBySubject +
  geom_hline(
    data = subjectMeans,
    aes(yintercept = mean, group = variable),
    col = "red"
  )
TODO: Find the countries that are significantly better than, significantly worse than, and not significantly different from the global averages, i.e. cluster the countries into three groups.
# One-way MANOVA of the three PISA scores on country (CNT); the summary
# printed below reports Pillai's trace.
summary(manova(cbind(PV1MATH, PV1READ, PV1SCIE) ~ CNT, data = pisa2012))
#> Df Pillai approx F num Df den Df Pr(>F)
#> CNT 42 0.32207 776.81 126 813837 < 2.2e-16 ***
#> Residuals 271279
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The MANOVA indicates that there are significant differences among the countries included in PISA. Let’s find them!
# Fit the factor merger on a random subsample of 500 rows rather than on the
# full data set.
#
# The RNG seed is fixed so that the subsample, and therefore the fitted
# merging path, is the same on every run.
# NOTE(review): the printed output that follows in this file predates the
# seed and may not match a fresh run exactly.
set.seed(2012)
pisaIdxSubset <- sample(seq_len(nrow(pisa2012)), size = 500)

# Response: the first three columns of the sampled rows. Grouping factor: the
# country of the same rows; factor() is re-applied, presumably to drop the
# levels of countries that did not make it into the subsample.
pisaFM <- mergeFactors(
  pisa2012[pisaIdxSubset, 1:3],
  factor(pisa2012$CNT[pisaIdxSubset])
)
pisaFM
#> Family: gaussian Factor Merger.
#> Factor levels were recoded as below:
#>
#> recoded original
#> -------- -------------------------
#> (UnAE) United Arab Emirates
#> (Astrl) Australia
#> (Austr) Austria
#> (Blgm) Belgium
#> (Blgr) Bulgaria
#> (Brzl) Brazil
#> (Cand) Canada
#> (Chil) Chile
#> (Clmb) Colombia
#> (CzcR) Czech Republic
#> (Grmn) Germany
#> (Dnmr) Denmark
#> (Span) Spain
#> (Fnln) Finland
#> (Frnc) France
#> (UntK) United Kingdom
#> (Hngr) Hungary
#> (Irln) Ireland
#> (Isrl) Israel
#> (Itly) Italy
#> (Japn) Japan
#> (Kore) Korea
#> (Mlys) Malaysia
#> (Nthr) Netherlands
#> (Nrwy) Norway
#> (Plnd) Poland
#> (Ch-S) China-Shanghai
#> (RssF) Russian Federation
#> (Sngp) Singapore
#> (Serb) Serbia
#> (SlvR) Slovak Republic
#> (Swdn) Sweden
#> (ChnT) Chinese Taipei
#> (Trky) Turkey
#> (USoA) United States of America
#>
#> Factor levels were merged in the following order:
#>
#> groupA groupB model GIC pvalVsFull pvalVsPrevious
#> ---- -------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------------------- ---------- --------- ----------- ---------------
#> 1 -8045.028 16160.06 1.0000 1.0000
#> 11 (Nthr) (Chil) -8045.051 16158.10 0.9977 0.9977
#> 12 (Swdn) (Fnln) -8045.105 16156.21 0.9999 0.9919
#> 13 (Swdn)(Fnln) (Blgr) -8045.196 16154.39 1.0000 0.9822
#> 14 (Blgm) (Ch-S) -8045.298 16152.60 1.0000 0.9790
#> 15 (Mlys) (Sngp) -8045.419 16150.84 1.0000 0.9734
#> 16 (USoA) (Frnc) -8045.544 16149.09 1.0000 0.9719
#> 17 (Hngr) (Nthr)(Chil) -8045.681 16147.36 1.0000 0.9679
#> 18 (ChnT) (SlvR) -8045.845 16145.69 1.0000 0.9584
#> 19 (Irln) (Kore) -8046.032 16144.06 1.0000 0.9498
#> 110 (UntK) (USoA)(Frnc) -8046.250 16142.50 1.0000 0.9375
#> 111 (Japn) (CzcR) -8046.536 16141.07 1.0000 0.9097
#> 112 (Dnmr) (Brzl) -8046.834 16139.67 1.0000 0.9043
#> 113 (RssF) (Itly) -8047.239 16138.48 1.0000 0.8566
#> 114 (Irln)(Kore) (Blgm)(Ch-S) -8047.650 16137.30 1.0000 0.8536
#> 115 (Astrl) (Plnd) -8048.064 16136.13 1.0000 0.8514
#> 116 (Austr) (ChnT)(SlvR) -8048.508 16135.02 1.0000 0.8374
#> 117 (Mlys)(Sngp) (Span) -8048.972 16133.94 1.0000 0.8283
#> 118 (Isrl) (Irln)(Kore)(Blgm)(Ch-S) -8049.469 16132.94 1.0000 0.8118
#> 119 (Nrwy) (Cand) -8050.310 16132.62 1.0000 0.6550
#> 120 (Trky) (UntK)(USoA)(Frnc) -8051.336 16132.67 1.0000 0.5767
#> 121 (Astrl)(Plnd) (Grmn) -8052.566 16133.13 1.0000 0.4975
#> 122 (UnAE) (Swdn)(Fnln)(Blgr) -8053.808 16133.62 1.0000 0.4924
#> 123 (Hngr)(Nthr)(Chil) (RssF)(Itly) -8055.242 16134.48 1.0000 0.4258
#> 124 (Dnmr)(Brzl) (Clmb) -8056.878 16135.76 1.0000 0.3641
#> 125 (Dnmr)(Brzl)(Clmb) (Serb) -8058.819 16137.64 1.0000 0.2858
#> 126 (Austr)(ChnT)(SlvR) (Hngr)(Nthr)(Chil)(RssF)(Itly) -8060.877 16139.75 1.0000 0.2590
#> 127 (Austr)(ChnT)(SlvR)(Hngr)(Nthr)(Chil)(RssF)(Itly) (Mlys)(Sngp)(Span) -8063.868 16143.74 1.0000 0.1189
#> 128 (Nrwy)(Cand) (Isrl)(Irln)(Kore)(Blgm)(Ch-S) -8066.977 16147.95 1.0000 0.1069
#> 129 (UnAE)(Swdn)(Fnln)(Blgr) (Trky)(UntK)(USoA)(Frnc) -8070.437 16152.87 0.9997 0.0785
#> 130 (Astrl)(Plnd)(Grmn) (Japn)(CzcR) -8075.792 16161.58 0.9954 0.0144
#> 131 (UnAE)(Swdn)(Fnln)(Blgr)(Trky)(UntK)(USoA)(Frnc) (Astrl)(Plnd)(Grmn)(Japn)(CzcR) -8088.486 16184.97 0.7627 0.0000
#> 132 (Dnmr)(Brzl)(Clmb)(Serb) (Austr)(ChnT)(SlvR)(Hngr)(Nthr)(Chil)(RssF)(Itly)(Mlys)(Sngp)(Span) -8105.111 16216.22 0.1084 0.0000
#> 133 (UnAE)(Swdn)(Fnln)(Blgr)(Trky)(UntK)(USoA)(Frnc)(Astrl)(Plnd)(Grmn)(Japn)(CzcR) (Nrwy)(Cand)(Isrl)(Irln)(Kore)(Blgm)(Ch-S) -8122.216 16248.43 0.0013 0.0000
#> 134 (Dnmr)(Brzl)(Clmb)(Serb)(Austr)(ChnT)(SlvR)(Hngr)(Nthr)(Chil)(RssF)(Itly)(Mlys)(Sngp)(Span) (UnAE)(Swdn)(Fnln)(Blgr)(Trky)(UntK)(USoA)(Frnc)(Astrl)(Plnd)(Grmn)(Japn)(CzcR)(Nrwy)(Cand)(Isrl)(Irln)(Kore)(Blgm)(Ch-S) -8171.413 16344.83 0.0000 0.0000
# Plot the fitted merger, requesting the "profile" response panel.
plot(
  pisaFM,
  responsePanel = "profile"
)
On a big dataset it is faster to use the ‘hclust’ method.
# Fit on the full data set with method = "hclust" and successive = TRUE, then
# plot the result.
# NOTE(review): despite the "Math" suffix, all three response columns (1:3)
# are passed here -- confirm that this is intended.
pisaFMHClustMath <- mergeFactors(
  pisa2012[, 1:3],
  factor(pisa2012$CNT),
  successive = TRUE,
  method = "hclust"
)
plot(pisaFMHClustMath)
# Same full-data "hclust" fit on the first three columns, this time with
# successive = FALSE, then plot the result.
pisaFMHClust <- mergeFactors(
  pisa2012[, 1:3],
  factor(pisa2012$CNT),
  successive = FALSE,
  method = "hclust"
)
plot(pisaFMHClust)
Let’s now try the same analysis using European countries only.
# Countries kept for the Europe-only analysis.
europeanCountries <- c(
  "Austria", "Belgium", "Bulgaria",
  "Czech Republic", "Germany", "Denmark",
  "Spain", "Estonia", "Finland",
  "France", "Hungary", "Ireland",
  "Italy", "Netherlands", "Norway",
  "Poland", "Portugal",
  "Russian Federation", "Slovak Republic",
  "Slovenia"
)
pisaEuropean <- pisa2012 %>%
  filter(CNT %in% europeanCountries)

# "hclust" fit with successive = TRUE on the European rows only. factor() is
# re-applied to CNT, presumably to drop the levels of the filtered-out
# countries.
pisaFMHClustEurope <- mergeFactors(
  pisaEuropean[, 1:3],
  factor(pisaEuropean$CNT),
  successive = TRUE,
  method = "hclust"
)
plot(pisaFMHClustEurope)