#################################################################
#
# NOTE BOTH THE PAKAGES FEATURED HERE (NORM AND MICE)
# ARE NOT WORKING PROPERLY ON THE LATEST VERSIONS OF
# THE R INSTALLATION (VERSION 2.1)
#
# THIS FILE AND OTHERS ON THE PEAS SITE WILL
# BE BROUGHT UP TO DATE WHEN THEY ARE REINSTALLED
################################################
# HOW TO USE THIS FILE
# This is an HTML version of the R script file ex2.R
# It is intended to show you the code and to allow links.
# NOT to use interactively to run the code
# Pasting the  into R might work, but it might not
# to view the ouptut from these commands in html go to R results page
red stuff (R commands)
#
#
#########  ANY TEXT AFTER THE # CHARACTER (here shown black)
#             ARE TREATED AS COMMENTS BY R
#
#
##############################################
#
# data are in a data frame shs
# to get the names of the variables
#

Links in this page

Using MICE for a small imputation problem
Post imputation procedures for this
Using norm for a smallimputation problem
MICE imputation procedures for larger data set
Post imputation procedures for larger data set
Trying norm for big data set

names(data)
######################################################################
# First a small example with just the prevalence data and gender
# binary data imputed with MICE
# If you have not done so already you will need to install
# the library from CRAN vai the packages menu
# NOTE as of july 05,on R 2.11 this was missing from the selection
# list on the menu, so instead go to the developers' web site
# http://web.inter.nl.net/users/S.van.Buuren/mi/hmtl/mice.htm
# download the latest zip file and install this
#
# These steps only need doing once
######################################################################
dprev<-data[,c(14,22,25,28,31,34,37)]
names(dprev)
#
# set up library - you need to do this every time you 
# want to use mice. 

#
#  Now the imputations
# You need to specify logistic 
# regression because the codes  are numeric
# maxit has been set to 30 to ensure convergence
#

library(mice)
impprev<-mice(dprev,m=10, maxit=30,imputationMethod = rep("logreg",7),printFlag=T)
#  back to top uparrow
#
# imprev  is a special object that stores the different bits
# of the imputation efficiently. It is a list and the component
# imp gives another list that is the imputed values for each
# imputed variable for cases with missing data only
# These commands will show you it
#

# Before you use plot make sure that you have saved the
#   command file newfuns.R to your computer and
# have sourced it into R (file menu from console). 
# It fixes a problem with the plots
# and summary commands on the current version of R (July 05)
##########################################
plot(impprev,F)
#
# this shows the convergence plots, ideally they
# should be without trend and uncorrelated
# They are here
#
print(impprev)
names(impprev)
#
# the imp component gives the results for 
# the rows with any missing data with all
# the imputations for one variable grouped together
# examine top 10 rows for DPREV6
#
names(impprev$imp)
impprev$imp$DPREV6[1:10,]
#
# complete(,i) gives ith complete imputati
################################################
#  next bit of code gets a matrix to show how
#   the imputed data changes the prevalence
################################################
complmeans<-apply(dprev,2,mean,na.rm=T)[-1]*100
impmeans<-100*rbind(
apply(complete(impprev,1),2,mean)[-1],
apply(complete(impprev,2),2,mean)[-1],
apply(complete(impprev,3),2,mean)[-1],
apply(complete(impprev,4),2,mean)[-1],
apply(complete(impprev,5),2,mean)[-1],
apply(complete(impprev,6),2,mean)[-1],
apply(complete(impprev,7),2,mean)[-1],
apply(complete(impprev,8),2,mean)[-1],
apply(complete(impprev,9),2,mean)[-1],
apply(complete(impprev,10),2,mean)[-1])
allmeans<-rbind(complmeans,impmeans)
print(round(allmeans,2))
 
 
#
#  back to top uparrow
#
# post imputation inference for this data set
# first simple linear model predcition of prevalence
# compared to original data
#
 summary(glm(DPREV6~GENDER,dprev))
 summary(pool(glm.mids(DPREV6~GENDER,impprev)))
#
# This only works so far for linear models
# The post imputation functions from NORM
# allow logistic regression, though they are more work
#
library(norm)
coeffs<-list(10)
sds<-list(10)
for (i in 1:10){
coeffs[[i]]<- summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,1]
sds[[i]]<- summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,2]
}
result<-mi.inference(coeffs,sds)result
ans<-cbind(result[[1]],result[[2]],result[[8]]*100)
dimnames(ans)[[2]]<-c('mean','se','% info lost')
print(ans)
#  back to top uparrow
#
####################################################
#
# now the same small model fitted with NORM # This code commented out - it is in the original file # as there is a serious bug in this package
# It has been reported by others and now
# by me too # back to top uparrow #################################################### # now a bigger model first remove the CASEID # as we don't want to use it in imputation # also the prevalence columns because they # can be derived from the others # # have also missed off the drug use variables because # for some reason they will not be accepted as factors # even though they only have 4 categories # names(data) ######################################################
forimp<-data[,c(-1,-22,-25,-28,-31,-34,-37,-40:-45)]
summary(forimp)
#
# make a vector for methods and check it<
# note that norm had to be used for all categories with >4 
# cases
#
method<-c( rep("norm",12),rep("logreg",7),'polyreg',
rep(c('norm'),12))
rbind(names(forimp),method)
#
# maxit increased to 30 when plots clearly not converged
#
impbig<-mice(forimp,imputationMethod =method,m=4,maxit=30)
#
# plot convergence #
plot(impbig,F)

#  back to top uparrow
#
# now plot at observed and imputed values
#
par(mfrow=c(2,2))
hist(as.matrix(forimp$DVOL6),main='DVOL6 observed',xlab='volume',col=3)
hist(forimp$DVAR6,main='DVAR6 observed',xlab='variety',col=3)
hist(impbig$imp$DVOL6[,4],main='DVOL6 imputed',xlab='volume',col=4)
hist(impbig$imp$DVAR6[,4],main='DVAR6 imputed',xlab='variety',col=4)
#
# we can see that imputing as normals has given 
# distributions with a range of vALUES
# This is fixed below
# now make a big data frame of complete data
# with the 4 imputations stacked
#
bigcomplete<-complete(impbig,'long')
names(bigcomplete)
# 
#
# now logistic regression analyses for survey data ren=moving missing categories
#
bigcomplete[,21:32]<-round(bigcomplete[,21:32])
bigcomplete[,21:32][bigcomplete[,21:32]<0]<-0
bigcomplete[,22][bigcomplete[,23]==0]<-0 # set vols to 0 if var 0
bigcomplete[,24][bigcomplete[,25]==0]<-0
bigcomplete[,26][bigcomplete[,27]==0]<-0
bigcomplete[,28][bigcomplete[,29]==0]<-0
bigcomplete[,30][bigcomplete[,31]==0]<-0
bigcomplete[,22][bigcomplete[,23]>0]<-1 # set vols to 1 if var> 0
bigcomplete[,24][bigcomplete[,25]>0]<-1
bigcomplete[,26][bigcomplete[,27]>0]<-1
bigcomplete[,28][bigcomplete[,29]>0]<-1
bigcomplete[,30][bigcomplete[,31]>0]<-1
bigcomplete$DPREV1<-bigcomplete$DVAR1 # prev to 1 if var>0
bigcomplete$DPREV2<-bigcomplete$DVAR2
bigcomplete$DPREV3<-bigcomplete$DVAR3
bigcomplete$DPREV4<-bigcomplete$DVAR4
bigcomplete$DPREV5<-bigcomplete$DVAR5
bigcomplete$DPREV6<-bigcomplete$DVAR6
bigcomplete[,35:40][bigcomplete[,35:40]>0]<-1
#
# and make matrix of prevalence for complete data and each iteration
#
prevfrombig<-
cbind(apply(bigcomplete[,35:40][bigcomplete$'.imp'==1,],2,mean)*100,
apply(bigcomplete[,35:40][bigcomplete$'.imp'==2,],2,mean)*100,
apply(bigcomplete[,35:40][bigcomplete$'.imp'==3,],2,mean)*100,
apply(bigcomplete[,35:40][bigcomplete$'.imp'==4,],2,mean)*100)
round(prevfrombig,2)

#  back to top uparrow
#
#
# now repeat this using norm which treats everything as normal
# as mentioned above there is a nasty bug in this code

library(norm)
rngseed(100)
datam<-as.matrix(data[,-c(1,22,25,28,31,34,37)])  # exclude prevalence as this leads to collinearity
 prelim.datam<-prelim.norm(datam)
 theta.datam<-em.norm(prelim.datam)
#
# notice that this goes to 1000 iterations 
# the maximum which looks a bit dodgy
# not likely to improve with more
#
imputed.datam<-imp.norm(prelim.datam,theta.datam,datam)
imputed.datam[1:20,]
#
#   clearly some of these imputed values are nonsense
#