## Package-name marker for this source file (legacy R packaging convention).
.packageName <- "mdqc"
##  code by Justin Harrington

## mdqc - Methods:
##  "nogroups" - use all columns
##  "apriori" - use groups of columns (elements of a list)
##  "global" - global pca (need to specify how many principal components)
##  "cluster" - use (pam) to cluster columns (need to specify how many clusters)
##  "loading" - loading PCA method by clustering in the space of the loadings (need to specify how many clusters & PCs)

################
## Main Function
################
mdqc <- function(x, method=c("nogroups", "apriori", "global", "cluster", "loading"),
                 groups=NULL, k=NULL, pc=NULL, robust=c("S-estimator","MCD", "MVE"),
                 nsamp=10*nrow(x)){
  ## Multivariate quality control: robust Mahalanobis distances per group.
  ##
  ## x      - dataset (matrix or data frame); must be numeric.
  ## method - grouping strategy (see file header for the five choices).
  ## groups - for the "apriori" method: a list of column-index vectors,
  ##          e.g. groups=list(c(1,2), c(4,6)).
  ## k      - number of clusters ("cluster" and "loading" methods).
  ## pc     - number of principal components ("global" and "loading" methods).
  ## robust - robust estimator of location/scatter (S-estimator, MCD or MVE).
  ## nsamp  - number of subsamples forwarded to the robust estimator.
  ##
  ## Returns an object of class "mdqc".

  ## Argument matching for method & robust - fails if no match
  method <- match.arg(method)
  robust <- match.arg(robust)
  rmethod <- switch(robust, "S-estimator"="mySm",
                    "MCD"="cov.mcd",
                    "MVE"="cov.mve")

  ## Data validation on x
  x <- as.matrix(x)
  if (!is.numeric(x)) stop("The matrix x does not appear to be numeric.")
  if (nrow(x) < ncol(x) && method == "nogroups") stop("The matrix x has less rows than columns.")

  ## Argument validation (on k, pc, groups etc)
  if (method=="apriori"){
    if (is.null(groups)) stop("You have selected the a-priori method without specifying groupings.")
    if (any(lengths(groups) > nrow(x))) stop("The matrix x has less rows than columns in at least one group.")
  }
  if (method %in% c("global","loading")){
    ## The NULL test must come first: 'pc > nrow(x)' with a NULL pc yields
    ## logical(0), so if() fails with "argument is of length zero" instead
    ## of the intended message.
    if (is.null(pc)) stop("You need to specify the number of principal components for the ", method, " method.")
    if (pc < 1) stop("The number of principal components must be an integer greater than 0.")
    if (pc > nrow(x)) stop("The matrix x has less rows than principal components.")
  }
  if (method %in% c("cluster","loading")){
    if (is.null(k)) stop("You need to specify the number of clusters for the ", method, " method.")
    if (k < 1) stop("The number of clusters must be an integer greater than 0.")
  }

  ## By default work on the raw data; for the PCA-based methods build the
  ## transformed space first.
  y <- x
  if (method %in% c("global", "loading")){
    ## Perform robust PCA
    robPCA <- prcomp.robust(x, robust, nsamp=nsamp)
    if (method == "global") ## use the scores for the global PCA method
      y <- robPCA$x
  }

  ## Create groupings when method is cluster/loading
  if (method == "cluster"){
    ## Cluster the columns with PAM on a robust-correlation dissimilarity.
    robCov <- do.call(rmethod, list(x=y, nsamp=nsamp))$cov
    robCor <- diag(1/sqrt(diag(robCov)))%*%robCov%*%diag(1/sqrt(diag(robCov)))
    robDist <- as.dist((1-robCor)/2)
    clustout <- pam(x=robDist, k=k, cluster.only=TRUE)
    if (any(table(clustout) > nrow(x))) stop("The matrix x has less rows than columns in at least one of the groups from clustering.")
  }

  if (method == "loading"){
    ## Cluster the variables in the space of the first pc loadings.
    clustout <- pam(x=robPCA$rotation[, (1:pc)], metric="manhattan", k=k, cluster.only=TRUE)
  }

  ## Apply method to form groups
  groups <- switch(method,
                   "nogroups"=list(1:ncol(y)),
                   "global"=list(1:pc),
                   "apriori"=groups,
                   "cluster"=,"loading"=split(1:ncol(y), clustout))

  ngroups <- length(groups)

  ## Calculate robust Mahalanobis distances for every group
  mdqcValues <- vector("list", ngroups)
  for (i in seq_len(ngroups)){
    if (method == "global"){
      ## Single group of scores: estimate on all scores, then keep only the
      ## center/cov of the leading pc components.
      robMdqc <- do.call(rmethod, list(x=y, nsamp=nsamp))
      robMdqc$center <- robMdqc$center[1:pc]
      robMdqc$cov <- robMdqc$cov[1:pc, 1:pc]
    } else {
      robMdqc <- do.call(rmethod, list(x=y[, groups[[i]] ], nsamp=nsamp))
    }

    if (length(groups[[i]]) > 1){
      mdqcValues[[i]] <- sqrt(mahalanobis(y[, groups[[i]] ], robMdqc$center, robMdqc$cov))
    } else {
      ## A one-column group: the distance reduces to the absolute z-score.
      mdqcValues[[i]] <- abs(scale(y[, groups[[i]] ], center=robMdqc$center, scale=sqrt(robMdqc$cov)))
    }
  }

  ## Return data
  mdqcObject <- list(ngroups=ngroups, groups=groups, mdqcValues=mdqcValues, x=x, method=method, robust=robust, pc=pc, k=k)
  class(mdqcObject) <- "mdqc"
  return(mdqcObject)
}


#####################
## robust prcomp
#####################

prcomp.robust <- function(x, robust=c("S-estimator","MCD", "MVE"), nsamp=10*nrow(x), ...){
  ## Robust analogue of prcomp(): standardize the data with a robust
  ## center/scale, then take the SVD of a robust covariance estimate of the
  ## standardized data. Returns an object of class "prcomp".
  robust <- match.arg(robust)
  estimator <- switch(robust,
                      "S-estimator"="mySm",
                      "MCD"="cov.mcd",
                      "MVE"="cov.mve")

  ## First pass: robust location/scale for column-wise standardization.
  fit <- do.call(estimator, list(x=x, nsamp=nsamp))
  std <- scale(x, center=fit$center, scale=sqrt(diag(fit$cov)))

  ## Second pass: robust covariance of the standardized data gives the PCs.
  fit <- do.call(estimator, list(x=std, nsamp=nsamp))
  decomp <- svd(fit$cov)
  scores <- std %*% decomp$v

  result <- list(sdev=sqrt(decomp$d), rotation=decomp$v, x=scores)
  dimnames(result$rotation) <- list(colnames(x), paste("PC", 1:ncol(x), sep=""))
  class(result) <- "prcomp"
  result
}


#####################
########## S3 Methods
#####################
plot.mdqc <- function(x, levels=c(0.9, 0.95, 0.99), xlab="", ylab="", mfrow=NULL, mfcol=NULL, ...){
  ## Plot the robust distances of an "mdqc" fit, one panel per group, with
  ## horizontal cutoffs at sqrt(qchisq(levels, df)); points beyond the lowest
  ## cutoff are filled and labelled with their observation index.
  if (x$ngroups > 1){
    ## Arrange a near-square grid of panels; restore par() settings on exit.
    if (is.null(mfrow) && is.null(mfcol)){
      side <- ceiling(sqrt(x$ngroups)) ## lazy default layout
      opar <- par(mfrow=c(ceiling(x$ngroups/side), side))
    } else {
      opar <- par(mfrow=mfrow, mfcol=mfcol)
    }
    on.exit(par(opar))
  }

  obs <- seq_len(nrow(x$x))

  for (g in seq_len(x$ngroups)){
    df <- length(x$groups[[g]])
    vals <- x$mdqcValues[[g]]
    plot(obs, vals, xlab=xlab, ylab=ylab,
         main=ifelse(x$ngroups > 1, paste("Group", g), ""), ...)
    abline(h=sqrt(qchisq(levels, df)), lwd=1.3, lty=1:3)
    flagged <- which(vals > sqrt(qchisq(min(levels), df)))
    points(obs[flagged], vals[flagged], pch=19, col=1)
    text(obs[flagged], vals[flagged]-par("cxy")[2], flagged)
  }
}

summary.mdqc <- function(object, ...) {
  ## Print a summary of an "mdqc" fit: method, estimator, and per group the
  ## number of observations exceeding the square root of the 90/95/99%
  ## chi-square quantiles (df = group size).
  cat("\nSummary information for MDQC\n")
  cat("Method used:", object$method, "\tNumber of groups:", object$ngroups,
      "\nRobust estimator:", object$robust)
  if (object$method %in% c("loading", "global"))
    cat("\tNumber of Principal Components:", object$pc, "\n")
  cat("\nNumber of Outliers:\n")
  for (i in seq_len(object$ngroups)) {
    if (object$ngroups > 1) {
      cat("Group", i, " - Columns\n")
      ## Bug fix: was 'object$group', which only resolved via $ partial
      ## matching (the element is named 'groups').
      print(object$groups[[i]])
    }
    r <- length(object$groups[[i]])
    cat("90%\t95%\t99%\n")
    for (crit in c(0.9, 0.95, 0.99)) {
      flag <- which(object$mdqcValues[[i]] > sqrt(qchisq(crit, r)))
      cat(length(flag), "\t")
    }
    cat("\n")
  }
}

print.mdqc <- function(x, ... ){
  ## Print an "mdqc" fit: for each group, list the observations whose robust
  ## distance exceeds the square root of the 90/95/99% chi-square quantiles
  ## (df = group size).
  index <- seq_len(nrow(x$x))
  cat("Method used:", x$method, "\tNumber of groups:", x$ngroups,
      "\nRobust estimator:", x$robust)
  if (x$method %in% c("loading", "global"))
    cat("\tNumber of Principal Components:", x$pc, "\n")

  for (i in seq_len(x$ngroups)) {
    if (!(x$method %in% c("global", "nogroups"))) {
      cat("Group", i, " - Columns\n")
      ## Bug fix: was 'x$group', which only resolved via $ partial matching
      ## (the element is named 'groups').
      print(x$groups[[i]])
    }
    r <- length(x$groups[[i]])
    for (crit in c(0.9, 0.95, 0.99)) {
      cat("MDs exceeding the square root of the ", crit*100, "% percentile of the Chi-Square distribution\n")
      flag <- which(x$mdqcValues[[i]] > sqrt(qchisq(crit, r)))
      print(index[flag])
    }
    cat("\n")
  }
}

##Functions for Matlab Sm.m function:

##MADNOC

madnoc <- function(y){
  ## Column-wise median absolute deviation about ZERO (no centering),
  ## rescaled by 1/0.6745 for consistency at the normal distribution.
  ## A plain vector is treated as a single column.
  vals <- abs(y)
  if (!is.matrix(vals)) {
    vals <- t(as.matrix(vals))
  }
  if (nrow(vals) == 1 && ncol(vals) > 1) {
    ## A single row came from a vector input: flip it into one column.
    vals <- t(vals)
  }
  n <- nrow(vals)
  vals <- apply(vals, 2, sort)
  if (n %% 2 == 0) {
    med <- (vals[n/2, ] + vals[n/2 + 1, ])/2
  } else {
    med <- vals[(n + 1)/2, ]
  }
  med/.6745
}

##--------------------------------------------------------------------
##MAHALANOBIS
##build in function in S:

##mahalanobis(x, center, cov, inverted=F) 

##--------------------------------------------------------------------
##KSIINT

ksiint <- function(c, s, p){
  ## Truncated-moment integral used when computing the biweight S-estimator
  ## constants: 2^s * Gamma(s + p/2) * P(chi; s + p/2) / Gamma(p/2),
  ## where P is the incomplete-gamma CDF evaluated at c^2/2.
  shape <- s + p/2
  (2^s) * gamma(shape) * pgamma(c^2/2, shape) / gamma(p/2)
}
##--------------------------------------------------------------------
## Computes Tukey's biweight psi function with constant c for all 
## values in the vector x.

psibiweight <- function(x, c){
  ## Tukey's biweight psi function with tuning constant c, vectorized in x:
  ## x * (1 - (x/c)^2)^2 for |x| < c, and 0 outside that interval.
  u <- x/c
  core <- x * (1 - u^2)^2
  core * (abs(x) < c)
}

##--------------------------------------------------------------------
##UNIRAN
## The random generator.

uniran <- function(seed){
  ## Deterministic pseudo-random generator ported from the Matlab Sm.m code.
  ## Returns list(new_seed, deviate) with the deviate in [0, 1).
  new_seed <- floor(seed*5761) + 999
  new_seed <- new_seed - floor(new_seed/65536)*65536  ## reduce mod 65536
  list(new_seed, new_seed/65536)
}

##--------------------------------------------------------------------
##RANDOMSET

## This function is called if not all (p+1)-subsets out of n will be 
## considered. It randomly draws a subsample of nel cases out of tot. 

## tot must be greater than nel!!

randomset <- function(tot, nel, seed){
  ## Draw nel distinct indices from 1..tot using the deterministic uniran()
  ## generator, rejecting duplicates. Returns list(indices, final_seed).
  ## (uniran is deterministic, so a single call per draw reproduces the
  ## original's seed sequence exactly.)
  chosen <- rep(0, nel)
  for (pos in seq_len(nel)){
    repeat {
      step <- uniran(seed)
      seed <- step[[1]]
      candidate <- floor(step[[2]]*tot) + 1
      ## Accept the first draw unconditionally; afterwards reject repeats.
      if (!(candidate %in% chosen[seq_len(pos - 1)])) break
    }
    chosen[pos] <- candidate
  }
  list(chosen, seed)
}
##--------------------------------------------------------------------
##RHOBIWEIGHT

rhobiweight <- function(x, c){
  ## Tukey's biweight rho function with tuning constant c, vectorized in x:
  ## the polynomial x^2/2 - x^4/(2c^2) + x^6/(6c^4) inside |x| < c,
  ## saturating at the constant c^2/6 outside.
  inside <- abs(x) < c
  core <- x^2/2 - x^4/(2*c^2) + x^6/(6*c^4)
  core*inside + (c^2/6)*(!inside)
}

##--------------------------------------------------------------------
##TBSB

Tbsb <- function(c, p){
  ## Helper for the biweight S-estimator tuning: combines three ksiint()
  ## truncated moments with the chi-square tail mass beyond c^2.
  moments <- 3*ksiint(c, 1, p)/c - 3*ksiint(c, 2, p)/(c^3) + ksiint(c, 3, p)/(c^5)
  tail <- c*(1 - pchisq(c^2, p))
  moments + tail
}

##--------------------------------------------------------------------
##TBSC

## constant for Tukey Biweight S 

Tbsc <- function(alpha, p){
  ## Tuning constant for the Tukey biweight S-estimator in p dimensions with
  ## breakdown value alpha. Fixed-point iteration c <- Tbsb(c, p)/alpha,
  ## started at sqrt(qchisq(1 - alpha, p)), until the step size drops below
  ## eps or maxit iterations have been performed.
  talpha <- sqrt(qchisq(1 - alpha, p))
  maxit <- 1000
  eps <- 10^(-8)
  diff <- 10^6
  ctest <- talpha
  iter <- 1
  while ((diff > eps) && (iter < maxit)) {
    cold <- ctest
    ctest <- Tbsb(cold, p)/alpha
    diff <- abs(cold - ctest)
    ## Bug fix: was 'eter <- iter+1' (typo), so iter never advanced and the
    ## maxit safeguard could never trigger.
    iter <- iter + 1
  }
  ctest
}
##--------------------------------------------------------------------
##SESTCK
## Computes Tukey's biweight objectief function (scale) corresponding 
## with the mahalanobis distances x.  

##x must be entered as a matrix!!

sestck <- function(x, start, c, k, tol){
  ## Solve for the M-scale s satisfying mean(rhobiweight(x/s, c)) == k via a
  ## damped Newton-type iteration (port of the Matlab Sm.m routine).
  ##
  ## x     - matrix (or vector) of Mahalanobis distances; expected non-negative.
  ## start - starting scale; if <= 0, a MAD-style starting value is computed.
  ## c     - biweight tuning constant.
  ## k     - target mean of the rho function.
  ## tol   - relative convergence tolerance.
  ## Returns the (absolute value of the) estimated scale.
  if (start>0){
    s <- start
  }
  else { 
    ## No usable start: derive a MAD-like start as the column median of x
    ## divided by 0.6745 (same scheme as madnoc() above).
    ## NOTE(review): 'y <- abs(x)' is computed but never used afterwards;
    ## harmless only if x holds non-negative distances - confirm intent.
    y <- abs(x)
    if (is.matrix(x)==0) {
      x <- t(as.matrix(x))
    }
    n <- nrow(x)
    p <- ncol(x)
    if ((n==1)&(p>1)){
      ## A single row came from a vector input: flip it into one column.
      x <- t(x)
      n <- nrow(x)
      p <- ncol(x)
    }
    x <- apply(x, 2, sort)
    if (floor(n/2)==n/2){
      ## Even n: average the two middle order statistics.
      s <- (x[n/2,]+x[(n+2)/2,])/2
    }
    else {
      s <- x[(n+1)/2,]
    }
    s <- s/.6745
  }
  crit <- 2*tol
  ## rhoold holds the current residual g(s) = mean(rho(x/s)) - k.
  rhoold <- mean(rhobiweight(x/s, c))-k
  while (crit>=tol){
    ## Newton-like step: derivative of g approximated through psi.
    delta <- rhoold/mean(psibiweight(x/s, c)*(x/(s^2)))
    isqu <- 1
    okay <- 0
    while((isqu<10)&(okay!=1)){
      ## Step-halving line search: accept s+delta only if it shrinks |g|.
      rhonew <- mean(rhobiweight(x/(s+delta), c))-k
      if (abs(rhonew)<abs(rhoold)){
        s <- s+delta
        okay <- 1
      } 
      else {
        delta <- delta/2 
        isqu <- isqu+1
      }
    }
    if (isqu==10){
      ## Nine halvings without improvement: force termination.
      crit <- 0
    }
    else {
      ## Relative improvement of the residual, floored by tol.
      crit <- (abs(rhoold)-abs(rhonew))/max(abs(rhonew), tol)
    }
    rhoold <- rhonew
  }
  scale <- abs(s)
  return(scale)
}
##--------------------------------------------------------------------
## Computes biweight multivariate S-estimator for location/scatter with algorithm of 
## Ruppert
##
## Input:
##
## Required arguments
## x     :a data matrix. Rows of the matrix represent observations, 
##        columns represent variables. 
## nsamp :The number of random p-subsets considered.
## bdp   :Breakdown value of the S-estimator must be 0.15, 0.25 or 0.5.
##
##
## Output (field):
##
## mean        : vector of estimates for the center of the data.
## covariance  : matrix of estimates for the scatter of the data.
## distances   : vector of robust distances versus mean & covariance.
## scale       : distance scale estimate.

##SM
##p+1 must be greater than n!!!

mySm <- function(x, nsamp=2000, bdp=0.25){
  ## Biweight multivariate S-estimator of location/scatter (Ruppert's
  ## algorithm, ported from Matlab Sm.m). Repeatedly draws random
  ## (p+1)-subsets, builds candidate center/cov pairs, and keeps the pair
  ## minimizing the M-scale of the resulting Mahalanobis distances.
  ##
  ## x     - data matrix (rows = observations, columns = variables);
  ##         the comments above require n > p+1.
  ## nsamp - number of random (p+1)-subsets considered.
  ## bdp   - breakdown value; the header says it must be 0.15, 0.25 or 0.5,
  ##         though this is not validated here.
  ##
  ## Returns list(center=, cov=, distances, scale, c) - note that only the
  ## first two elements are named.
  tol <- 10^(-5)
  seed <- 0
  s <- 10^(11)        ## "infinite" initial scale; any valid candidate beats it
  x <- as.matrix(x)
  n <- nrow(x)
  p <- ncol(x)
  c <- Tbsc(bdp, p)   ## biweight tuning constant for this breakdown value
  k <- (c/6)*Tbsb(c, p)
  la <- 1             ## shrinkage exponent toward the current best estimate
  for (i in 1:nsamp){
    ## Draw a random (p+1)-subset deterministically via randomset()/uniran().
    ranset <- randomset(n, p+1, seed)[[1]]
    seed <- randomset(n, p+1, seed)[[2]]
    xj <- as.matrix(x[ranset,])
    mu <- apply(xj, 2, mean)
    cov <- var(xj)*(nrow(xj)-1)/nrow(xj)  ## ML (1/n) covariance of the subset
    determ <- det(cov)
    if (determ>10^(-15)) {
      if (determ^(1/p)>10^(-5)) {
        ## Normalize the candidate to unit determinant.
        cov <- determ^(-1/p)*cov
        if (i>ceiling(nsamp/5)) {
          ## Later iterations: randomly shrink the candidate toward the best
          ## estimate so far, more aggressively as i grows (la = 2, then 4).
          ## NOTE(review): muopt/covopt are first assigned only when a
          ## candidate improves the scale; if no acceptable candidate occurs
          ## in the first nsamp/5 draws this reference fails - confirm.
          if (i==ceiling(nsamp/2)){
            la <- 2
          }
          else {
            if (i==ceiling(nsamp*.8)) {
              la <- 4
            }
          }
          
          random <-  uniran(seed)[[2]]
          seed  <-  uniran(seed)[[1]] 
          random <- random^la
          mu <- random*mu+(1-random)*muopt
          cov <- random*cov+(1-random)*covopt
        }
        ## Re-normalize after the convex combination.
        determ <- det(cov)
        cov <- determ^(-1/p)*cov
        md <- mahalanobis(x, mu, cov, inverted = FALSE, tol.inv =.Machine$double.eps)
        md <- md^(1/2)
        ## Candidate accepted only if it improves the current M-scale test.
        if (mean(rhobiweight(md/s, c))<k) {
          if (s<5*10^10) {
            s <- sestck(md, s, c, k, tol)   ## warm start from current scale
          }
          else {
            s <- sestck(md, 0, c, k, tol)   ## first acceptance: cold start
          }
          muopt <- mu
          covopt <- cov
          mdopt <- md
          ## One-step reweighted refinement using biweight weights u = psi/md.
          psi <- psibiweight(md, s*c)
          u <- psi/md
          ubig <- matrix(t(u), nrow=length(u), ncol=p, byrow=FALSE)
          aux <- (ubig*x)/mean(u)
          mu <- apply(aux, 2, mean)
          xcenter <- t(t(x)-mu)
          cov <- t(ubig*xcenter)%*%xcenter
          cov <- det(cov)^(-1/p)*cov
          okay <- 0
          jj <- 1
          while ((jj<3)&(okay!=1)) {
            ## Up to two attempts to accept the refined estimate.
            jj <- jj+1
            md <- mahalanobis(x, mu, cov, tol.inv =.Machine$double.eps)
            md <- md^(1/2)
            if (mean(rhobiweight(md/s, c))<k){
              muopt <- mu
              covopt  <- cov
              mdopt  <- md
              okay <- 1
              if (s<5*10^10) {
                s <- sestck(md, s, c, k, tol)}
              else {
                s <- sestck(md, 0, c, k, tol)}
            }
            else {
              ## Refinement rejected: retry halfway back toward the best pair.
              mu <- (mu+muopt)/2
              cov <- (cov+covopt)/2
              ## NOTE(review): 'determ' here is the determinant computed for
              ## the earlier candidate, not of this averaged cov - possibly a
              ## stale-variable bug; confirm against the original Sm.m.
              cov <- determ^(-1/p)*cov
            }
          }
        }
      }
    }
  }
  ## Rescale the winning covariance by the final scale and report distances
  ## relative to it.
  covopt <- s^2*covopt
  mdopt <- mdopt/s
  res.mean <- muopt
  res.covariance <- covopt
  res.distances <- mdopt
  res.scale <- s
  return(list(center=res.mean, cov=res.covariance, res.distances, res.scale, c))
}

##end


## RIV estimator
betaIV <- function(L, V){
  ## Instrumental-variable (RIV) estimator: slope from the covariance ratio
  ## V[1,3]/V[2,3], intercept from the location vector L. Returns c(b0, b1).
  slope <- V[1, 3]/V[2, 3]
  intercept <- L[1] - slope*L[2]
  c(intercept, slope)
}
