#################################################################
#
# NOTE BOTH THE PACKAGES FEATURED HERE (NORM AND MICE)
# ARE NOT WORKING PROPERLY ON THE LATEST VERSIONS OF
# THE R INSTALLATION (VERSION 2.1)
#
# THIS FILE AND OTHERS ON THE PEAS SITE WILL
# BE BROUGHT UP TO DATE WHEN THEY ARE REINSTALLED
#
###############################################
#
# HOW TO USE THIS FILE
# This file (ex6.R) can just be sourced
# into the R program, but it is really intended to be used interactively
#
######### ANY TEXT AFTER THE # CHARACTER IS TREATED AS A COMMENT
#
# data stored in c:/GR/aprojects/peas/web/exemp6/data/ex6.Rdata
# this file is c:/GR/aprojects/peas/web/exemp3/program_code/ex3_R.html
#
##############################################
#
# data are in a data frame called data
# to get the names of the variables
#
names(data)
#
######################################################################
# First a small example with just the prevalence data and gender
# binary data imputed with MICE
# If you have not done so already you will need to install
# the library from CRAN via the packages menu
# NOTE as of July 05, on R 2.1.1 this was missing from the selection
# list on the menu, so instead go to the developers' web site
# http://web.inter.nl.net/users/S.van.Buuren/mi/hmtl/mice.htm
# download the latest zip file and install this
#
# These steps only need doing once
######################################################################
dprev<-data[,c(14,22,25,28,31,34,37)]
names(dprev)
#
# set up the library - you need to do this every time you
# want to use mice
#
# Now the imputations
# You need to specify logistic regression because the codes are numeric
# maxit has been set to 30 to ensure convergence
#
library(mice)
impprev<-mice(dprev,m=10,maxit=30,imputationMethod=rep("logreg",7),printFlag=T)
#
# impprev is a special object that stores the different bits
# of the imputation efficiently. It is a list, and the component
# imp gives another list of the imputed values for each
# imputed variable for cases with missing data only
# These commands will show you it
# Before you use plot make sure that you have saved the
# command file newfuns.R to your computer and
# have sourced it into R (file menu from console).
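#
# A minimal sketch of that step, assuming newfuns.R has been saved in the
# current working directory (adjust the path to wherever you put it):
#
# source("newfuns.R")    # loads the replacement plot and summary functions
#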
# It fixes a problem with the plots
# and summary commands on the current version of R (July 05)
#
plot(impprev,F)
#
# this shows the convergence plots; ideally they
# should be without trend and uncorrelated
# They are here
#
print(impprev)
names(impprev)
#
# the imp component gives the results for
# the rows with any missing data, with all
# the imputations for one variable grouped together
# examine the top 10 rows for DPREV6
#
names(impprev$imp)
impprev$imp$DPREV6[1:10,]
#
# complete(,i) gives the ith complete imputation
################################################
# the next bit of code gets a matrix to show how
# the imputed data change the prevalence
################################################
complmeans<-apply(dprev,2,mean,na.rm=T)[-1]*100
impmeans<-100*rbind(
 apply(complete(impprev,1),2,mean)[-1],
 apply(complete(impprev,2),2,mean)[-1],
 apply(complete(impprev,3),2,mean)[-1],
 apply(complete(impprev,4),2,mean)[-1],
 apply(complete(impprev,5),2,mean)[-1],
 apply(complete(impprev,6),2,mean)[-1],
 apply(complete(impprev,7),2,mean)[-1],
 apply(complete(impprev,8),2,mean)[-1],
 apply(complete(impprev,9),2,mean)[-1],
 apply(complete(impprev,10),2,mean)[-1])
allmeans<-rbind(complmeans,impmeans)
print(round(allmeans,2))
#
# post-imputation inference for this data set
# first a simple linear model prediction of prevalence
# compared to the original data
#
summary(lm(DPREV6~GENDER,dprev))
summary(pool(glm.mids(DPREV6~GENDER,data=impprev)))
#
# This only works so far for linear models
# The post-imputation functions from NORM
# allow logistic regression, though they are more work
#
summary(glm(DPREV6~GENDER,dprev,family='binomial'))
library(norm)
coeffs<-list(10)
sds<-list(10)
for (i in 1:10){
 coeffs[[i]]<-summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,1]
 sds[[i]]<-summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,2]
}
result<-mi.inference(coeffs,sds)
result
ans<-cbind(result[[1]],result[[2]],result[[8]]*100)
dimnames(ans)[[2]]<-c('mean','se','% info lost')
print(ans)
####################################################
#
# now the same small model fitted with NORM
# This code is commented out just now
# as there is a serious bug in this package
# It has been reported by others and now
# by me too
#
prevm<-as.matrix(dprev)
#prelim.prev<-prelim.norm(prevm)
#theta.prev<-em.norm(prelim.prev)
#getparam.norm(prelim.prev,theta.prev)
#
# converges in a few iterations OK
# now impute and plot some imputed values
#
#imputed.prev<-imp.norm(prelim.prev,theta.prev,prevm)
#imputed.prev[1:30,]
#par(mfrow=c(2,3))
#for (i in 2:7) hist(imputed.prev[,i],col=3,main=paste('prevalence',i-1))
#
#imputed.prev<-round(imputed.prev)
#for (i in 2:7) print(table(imputed.prev[,i]))
#imputed.prev[,2:7][imputed.prev[,2:7]<0]<-0
#imputed.prev[,2:7][imputed.prev[,2:7]>1]<-1
#apply(imputed.prev[,2:7],2,mean)*100
#print(round(allmeans,2))
#
####################################################
# now a bigger model: first remove the CASEID
# as we don't want to use it in imputation,
# and also the prevalence columns because they
# can be derived from the others
#
# the drug use variables have also been left out because
# for some reason they will not be accepted as factors
# even though they only have 4 categories
#
names(data)
######################################################
forimp<-data[,c(-1,-22,-25,-28,-31,-34,-37,-40:-45)]
summary(forimp)
#
# make a vector of methods and check it
# note that norm had to be used for all variables with more than 4 categories
#
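# As a quick illustrative check (a sketch only, not needed for the analysis),
# the number of distinct values each variable in forimp takes can be
# listed with, for example:
#
# sapply(forimp, function(x) length(unique(na.omit(x))))
#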
method<-c(rep("norm",12),rep("logreg",7),'polyreg',rep('norm',12))
rbind(names(forimp),method)
#
# maxit increased to 30 when the plots were clearly not converged
#
impbig<-mice(forimp,imputationMethod=method,m=4,maxit=30)
#
# plot convergence
#
plot(impbig,F)
#
# now plot the observed and imputed values
#
par(mfrow=c(2,2))
hist(as.matrix(forimp$DVOL6),main='DVOL6 observed',xlab='volume',col=3)
hist(forimp$DVAR6,main='DVAR6 observed',xlab='variety',col=3)
hist(impbig$imp$DVOL6[,4],main='DVOL6 imputed',xlab='volume',col=4)
hist(impbig$imp$DVAR6[,4],main='DVAR6 imputed',xlab='variety',col=4)
#
# we can see that imputing as normals has given
# distributions with a continuous range of values
# This is fixed below
#
# now make a big data frame of complete data
# with the 4 imputations stacked
#
bigcomplete<-complete(impbig,'long')
names(bigcomplete)
#
# first round all values of VOL and VAR imputed as normals
# to the nearest integer and set any negative values to zero
# then tidy the data for volume, since it needs to be
# zero when variety is zero,
# and add new columns for DPREV1 to DPREV6
#
bigcomplete[,21:32]<-round(bigcomplete[,21:32])
bigcomplete[,21:32][bigcomplete[,21:32]<0]<-0
bigcomplete[,22][bigcomplete[,23]==0]<-0 # set vols to 0 if var 0
bigcomplete[,24][bigcomplete[,25]==0]<-0
bigcomplete[,26][bigcomplete[,27]==0]<-0
bigcomplete[,28][bigcomplete[,29]==0]<-0
bigcomplete[,30][bigcomplete[,31]==0]<-0
bigcomplete[,22][bigcomplete[,23]>0]<-1 # set vols to 1 if var > 0
bigcomplete[,24][bigcomplete[,25]>0]<-1
bigcomplete[,26][bigcomplete[,27]>0]<-1
bigcomplete[,28][bigcomplete[,29]>0]<-1
bigcomplete[,30][bigcomplete[,31]>0]<-1
bigcomplete$DPREV1<-bigcomplete$DVAR1 # prev to 1 if var > 0
bigcomplete$DPREV2<-bigcomplete$DVAR2
bigcomplete$DPREV3<-bigcomplete$DVAR3
bigcomplete$DPREV4<-bigcomplete$DVAR4
bigcomplete$DPREV5<-bigcomplete$DVAR5
bigcomplete$DPREV6<-bigcomplete$DVAR6
bigcomplete[,35:40][bigcomplete[,35:40]>0]<-1
#
# and make a matrix of prevalence from the completed data for each imputation
#
prevfrombig<-
 cbind(apply(bigcomplete[,35:40][bigcomplete$'.imp'==1,],2,mean)*100,
 apply(bigcomplete[,35:40][bigcomplete$'.imp'==2,],2,mean)*100,
 apply(bigcomplete[,35:40][bigcomplete$'.imp'==3,],2,mean)*100,
 apply(bigcomplete[,35:40][bigcomplete$'.imp'==4,],2,mean)*100)
round(prevfrombig,2)
#
# now repeat this using norm, which treats everything as normal
#
library(norm)
#
# rngseed(100)
datam<-as.matrix(data[,-c(1,22,25,28,31,34,37)]) # exclude prevalence as this leads to collinearity
prelim.datam<-prelim.norm(datam)
theta.datam<-em.norm(prelim.datam)
#
# notice that this goes to 1000 iterations,
# the maximum, which looks a bit dodgy
# and is not likely to improve with more
#
imputed.datam<-imp.norm(prelim.datam,theta.datam,datam)
imputed.datam[1:20,]
#
# clearly some of these imputed values are nonsense
# bug in code reported
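#
# One way to tidy these up, in the same spirit as the rounding used for
# bigcomplete above, is a sketch like this (which columns are counts and
# should be clamped depends on the layout of datam):
#
# imputed.datam<-round(imputed.datam)
# imputed.datam[imputed.datam<0]<-0
#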
##############################################################
#############################################################
#
# the next bit does the plots to put on the web site
#
# (note: mean.g and sd.g are not created in this file and must already
# exist in the workspace before this block will run)
result<-mi.inference(mean.g,sd.g)
print("girls")
ans<-cbind(result[[1]],result[[2]],result[[8]]*100)
dimnames(ans)[[2]]<-c('mean','se','% info lost')
print(ans)
#####################################################################
# this is data brought in from Stata for comparison
#################################################################
bigimpprev<-matrix(c(.7356747,0.4410242,
 0.7308226,0.4435836,
 0.7839649,0.4115861,
 0.7495379,0.4333292,
 0.7220425,0.4480441,
 0.6042052,0.4890772,
 0.7324399,0.442738,
 0.728512,0.4447786,
 0.7835028,0.4119046,
 0.7488447,0.4337278,
 0.7220425,0.4480441,
 0.6136784,0.486962,
 0.7333641,0.4422515,
 0.728512,0.4447786,
 0.784658,0.4111069,
 0.7520795,0.4318553,
 0.7195009,0.449295,
 0.6046673,0.4889785,
 0.7312847,0.4433428,
 0.7310536,0.4434633,
 0.7832717,0.4120635,
 0.7481516,0.4341248,
 0.7204251,0.4488422,
 0.6053604,0.4888296),,2,byrow=T)*100
bigimpprev<-matrix(bigimpprev[,1],,4)
bigimpsas<-read.table('sasout.txt')
names(bigimpsas)<-c('imp','sweep','mean','sd')
simple<-matrix(c(1,4328,.7356747,.4410242,0,1
 ,2,4328,.7308226,.4435836,0,1
 ,3,4328,.7839649,.4115861,0,1
 ,4,4328,.7495379,.4333292,0,1
 ,5,4328,.7220425,.4480441,0,1
 ,6,4328,.6042052,.4890772,0,1
 ,1,4328,.7324399,.442738,0,1
 ,2,4328,.728512,.4447786,0,1
 ,3,4328,.7835028,.4119046,0,1
 ,4,4328,.7488447,.4337278,0,1
 ,5,4328,.7220425,.4480441,0,1
 ,6,4328,.6136784,.486962,0,1
 ,1,4328,.7333641,.4422515,0,1
 ,2,4328,.728512,.4447786,0,1
 ,3,4328,.784658,.4111069,0,1
 ,4,4328,.7520795,.4318553,0,1
 ,5,4328,.7195009,.449295,0,1
 ,6,4328,.6046673,.4889785,0,1
 ,1,4328,.7312847,.4433428,0,1
 ,2,4328,.7310536,.4434633,0,1
 ,3,4328,.7832717,.4120635,0,1
 ,4,4328,.7481516,.4341248,0,1
 ,5,4328,.7204251,.4488422,0,1
 ,6,4328,.6053604,.4888296,0,1),,6,byrow=T)
impdetsas<-matrix(c(1,4328,0.7479205,0.4342568,0,1.0000000
 ,2,4328,0.7372921,0.4401559,0,1.0000000
 ,3,4328,0.7878928,0.4088476,0,1.0000000
 ,4,4328,0.7543900,0.4304980,0,1.0000000
 ,5,4328,0.7435305,0.4367344,0,1.0000000
 ,6,4328,0.7058688,0.4557039,0,1.0000000
 ,1,4328,0.7495379,0.4333292,0,1.0000000
 ,2,4328,0.7423752,0.4373768,0,1.0000000
 ,3,4328,0.7881238,0.4086848,0,1.0000000
 ,4,4328,0.7548521,0.4302245,0,1.0000000
 ,5,4328,0.7430684,0.4369919,0,1.0000000
 ,6,4328,0.7093346,0.4541219,0,1.0000000
 ,1,4328,0.7442237,0.4363471,0,1.0000000
 ,2,4328,0.7405268,0.4383962,0,1.0000000
 ,3,4328,0.7881238,0.4086848,0,1.0000000
 ,4,4328,0.7527726,0.4314499,0,1.0000000
 ,5,4328,0.7432994,0.4368632,0,1.0000000
 ,6,4328,0.7081793,0.4546528,0,1.0000000
 ,1,4328,0.7483826,0.4339926,0,1.0000000
 ,2,4328,0.7405268,0.4383962,0,1.0000000
 ,3,4328,0.7878928,0.4088476,0,1.0000000
 ,4,4328,0.7555453,0.4298131,0,1.0000000
 ,5,4328,0.7430684,0.4369919,0,1.0000000
 ,6,4328,0.7003235,0.4581692,0,1.0000000),,6,byrow=T)
impdetsas<-impdetsas[,c(1,3,4)]
impdetsas.mean<-matrix(impdetsas[,2],,4)*100
simple<-matrix(simple[,3],6,4)
simple
#
# set up an empty plot first, then add the lines
#
plot(c(1,6),range(c(allmeans,bigimpprev,bigimpsas$mean*100,impdetsas.mean)),type='n',
 xlab='sweep of survey',ylab='prevalence of offending')
for (i in 1:4) lines(1:6,simple[,i]*100,lwd=3,col=2,type='b',lty=1)
for (i in 1:11) {col=1; if (i>1) col=2; lty=1; if (i>1) lty=2
 #lines(1:6,allmeans[i,],col=col,type='b',lty=lty)
}
legend(1,67,c('observed','SAS from summaries','R from summaries',
 'Stata from summaries','SAS from original questions'),
 lty=c(1:4,6),col=c(1:4,6),lwd=c(2,1,1,1))
for (i in 1:4) lines(1:6,prevfrombig[,i],col=3,lty=3,type='b')
for (i in 1:4) lines(1:6,bigimpprev[,i],col=4,lty=4,type='b')
for (i in 1:4) lines(1:6,bigimpsas$mean[bigimpsas$imp==i]*100,col=2,lty=2,type='b')
for (i in 1) lines(1:6,allmeans[i,],col=1,type='b',lty=1,lwd=2)
for (i in 1:4) lines(1:6,impdetsas.mean[,i],col=6,lty=6,type='b')
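#
# To save the finished figure for the web site, the plotting commands above
# could be wrapped in a graphics device; a sketch (the file name is only an
# example) would be:
#
# png('ex6_prevalence.png',width=600,height=600)
# ... repeat the plot(), lines() and legend() calls above ...
# dev.off()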