140.778 Adv Stat Comp
Code for Assignment 2


# simulate x data
sim.x <-
function(n=50,rho=0.5)
{
  x <- matrix(runif(n*3),ncol=3)
  v <- chol(matrix(c(1,rho,rho,rho,1,rho,rho,rho,1),ncol=3))
  x <- t(t(x %*% t(v))*c(5,1,20) + c(10,5,20))
  x <- as.data.frame(x)
  names(x) <- c("x1","x2","x3")
  x
}

# simulate y data with censoring
sim.data <-
function(x=sim.x(),beta=c(10,0.5,1,-0.5),sigma=2,censor=10)
{
  y <- as.numeric(as.matrix(cbind(1,x)) %*% beta) + rnorm(nrow(x),0,sigma)
  y[y > censor] <- censor

  data.frame(x,y=y)
}

  
# calculate negative log likelihood
negloglik <-
function(theta,d,censor=max(d$y))
{
  y <- d$y
  X <- as.matrix(cbind(1,d[,-ncol(d)]))
  beta <- theta[-length(theta)]
  sigma <- theta[length(theta)]
  
  loglik <- y
  o <- y < censor
  loglik[o] <- log(dnorm(y[o],X[o,] %*% beta, sigma))
  loglik[!o] <- log(1-pnorm(censor, X[!o,] %*% beta, sigma))

  -sum(loglik)
}


# LS fit ignoring censoring
fit1 <-
function(d)
{
  a <- summary(lm(y ~ . , data=d))

  b <- a$coef
  d <- a$sigma

  # calculate SEs and correlation
  V <- a$cov * d^2
  se <- sqrt(diag(V))
  rho <- V / rep(se,length(se)) / rep(se,rep(length(se),length(se)))

  list(theta=c(b[,1],d),se=se,cor=rho)
}

  
# fit using the nlm function
fit2 <-
function(d,...)
{
  start <- fit1(d)[[1]]
  o <- nlm(negloglik,start,d=d,censor=max(d$y),hessian=T,...)

  # calculate SEs and correlation
  V <- solve(o$hessian)
  se <- sqrt(diag(V))
  rho <- V / rep(se,length(se)) / rep(se,rep(length(se),length(se)))

  list(theta=o$est,se=se,cor=rho,o)
}


# fit using the EM algorithm
fit3 <-
function(d, start=fit1(d)$theta, censor=max(d$y),
         tol1=1e-8, tol2=tol1*100, maxit=1000, prnt=FALSE)
{
  y <- d$y
  o <- (y < censor)
  X <- as.matrix(cbind(1,d[,-ncol(d)]))

  H <- function(v) dnorm(v)/(1-pnorm(v))

  theta.old <- start
  p <- length(start)
  n <- length(y)

  for(i in 1:maxit) {

    # E step
    w <- y       # Expected observation
    wsq <- y^2   # Expected observation^2
    
    mu <- X[!o,] %*% theta.old[-p]
    w[!o] <- mu + theta.old[p] * H((censor - mu)/theta.old[p])

    wsq[!o] <- mu^2 + theta.old[p]^2 + theta.old[p] * (censor + mu) *
      H((censor-mu)/theta.old[p])

    # M step
    mu <- X %*% theta.old[-p]
    theta <- c(lm(w ~ -1 + X)$coef,
               sqrt((sum(wsq) - 2*sum(mu*w) + sum(mu^2))/n))

    if(prnt) {
      nll <- negloglik(theta,d,censor)
      print(c(i,nll,theta))
    }

    # check convergence
    if(all(abs(theta - theta.old) < tol1 * (abs(theta.old)+tol2))) break;

    theta.old <- theta

  }

  theta
}  


kbroman at jhsph.edu
http://www.biostat.wisc.edu/~kbroman

Last modified: Wed Apr 15 23:10:12 2009