Exemplar 6 R output

names(data)
dprev<-data[,c(14,22,25,28,31,34,37)]
names(dprev)

[1] "GENDER" "DPREV1" "DPREV2" "DPREV3" "DPREV4" "DPREV5"
[7] "DPREV6"


library(mice)
impprev<-mice(dprev,m=10, maxit=30,imputationMethod = rep("logreg",7),printFlag=T)


 iter imp variable
  1   1  DPREV1  DPREV2  DPREV3  DPREV4  DPREV5  DPREV6
  1   2  DPREV1  DPREV2  DPREV3  DPREV4  DPREV5  DPREV6
  1   3  DPREV1  DPREV2  DPREV3  DPREV4  DPREV5  DPREV6
  1   4  DPREV1  DPREV2  DPREV3  DPREV4  DPREV5  DPREV6
  1   5  DPREV1  DPREV2  DPREV3  DPREV4  DPREV5  DPREV6

#
# More lines not shown for 30 iterations and 10 imputations at each
# Before you use plot make sure that you have saved the
#   command file newfuns.R to your computer and
# have sourced it into R (file menu from console). 
# It fixes a problem with the plots
# and summary commands on the current version of R (July 05)
##########################################

plot(impprev,F)

#
# this shows the convergence plots, ideally they
# should be without trend and uncorrelated
# They are here - just one shwon - hit new

# lines to see each one 


#
#  back to top 
#

print(impprev)

Multiply imputed data set
Call:
mice(data = dprev, m = 10, imputationMethod = rep("logreg", 7), 
    maxit = 30, printFlag = T)
Number of multiple imputations:  10
Missing cells per column:
GENDER DPREV1 DPREV2 DPREV3 DPREV4 DPREV5 DPREV6 
     0    429    336    251    331    576    934 
Imputation methods:
  GENDER   DPREV1   DPREV2   DPREV3   DPREV4   DPREV5 
"logreg" "logreg" "logreg" "logreg" "logreg" "logreg" 
  DPREV6 
"logreg" 
VisitSequence:
DPREV1 DPREV2 DPREV3 DPREV4 DPREV5 DPREV6 
     2      3      4      5      6      7 
PredictorMatrix:
       GENDER DPREV1 DPREV2 DPREV3 DPREV4 DPREV5 DPREV6
GENDER      0      0      0      0      0      0      0
DPREV1      1      0      1      1      1      1      1
DPREV2      1      1      0      1      1      1      1
DPREV3      1      1      1      0      1      1      1
DPREV4      1      1      1      1      0      1      1
DPREV5      1      1      1      1      1      0      1
DPREV6      1      1      1      1      1      1      0
Random generator seed value:  0

names(impprev)

 [1] "call"             "data"            
 [3] "m"                "nmis"            
 [5] "imp"              "imputationMethod"
 [7] "predictorMatrix"  "visitSequence"   
 [9] "seed"             "iteration"       
[11] "lastSeedValue"    "chainMean"       
[13] "chainVar"         "pad"

#
# the imp component gives the results for 
# the rows with any missing data with all
# the imputations for one variable grouped together
# examine top 10 rows for DPREV6
#

names(impprev$imp)

[1] "GENDER" "DPREV1" "DPREV2" "DPREV3" "DPREV4" "DPREV5"
[7] "DPREV6"

impprev$imp$DPREV6[1:10,]

   1 2 3 4 5 6 7 8 9 10
1  0 1 1 1 1 1 1 1 1  1
5  1 1 1 1 1 1 1 0 0  1
11 1 0 1 1 0 1 1 1 1  0
18 1 0 1 1 1 0 0 1 1  1
20 1 1 0 1 0 1 1 1 0  1
29 0 1 1 1 1 0 0 1 1  1
32 0 1 1 0 1 1 1 0 1  1
41 0 1 1 1 1 1 1 1 1  1
47 1 1 1 1 1 1 0 1 1  1
50 1 0 0 0 0 0 0 1 0  0

#
# complete(,i) gives ith complete imputati
################################################
#  next bit of code gets a matrix to show how
#   the imputed data changes the prevalence
################################################

complmeans<-apply(dprev,2,mean,na.rm=T)[-1]*100
impmeans<-100*rbind(
apply(complete(impprev,1),2,mean)[-1],
apply(complete(impprev,2),2,mean)[-1],
apply(complete(impprev,3),2,mean)[-1],
apply(complete(impprev,4),2,mean)[-1],
apply(complete(impprev,5),2,mean)[-1],
apply(complete(impprev,6),2,mean)[-1],
apply(complete(impprev,7),2,mean)[-1],
apply(complete(impprev,8),2,mean)[-1],
apply(complete(impprev,9),2,mean)[-1],
apply(complete(impprev,10),2,mean)[-1])
allmeans<-rbind(complmeans,impmeans)
print(round(allmeans,2))

           DPREV1 DPREV2 DPREV3 DPREV4 DPREV5 DPREV6
complmeans  73.07  72.62  78.00  74.33  71.06  59.02
            73.48  73.01  78.10  74.77  72.34  61.32
            73.41  72.92  78.35  74.95  71.72  60.21
            73.15  72.83  78.30  75.02  71.95  60.84
            73.24  73.22  78.19  74.86  71.93  59.77
            73.17  72.85  78.12  74.88  72.37  59.33
            72.99  72.85  78.28  74.75  71.95  60.88
            73.06  72.94  78.37  75.07  72.18  60.67
            73.50  72.97  78.26  75.16  71.90  60.33
            73.17  72.83  78.40  74.77  71.77  61.34
            73.15  73.11  78.47  75.09  72.09  60.74

# YOu can see that the imputed data are giving very 
# similar mean values to the complete data on the top line
#

#
#  back to top 
#
# post imputation inference for this data set
# first simple linear model predcition of prevalence
# compared to original data
#

 summary(lm(DPREV6~GENDER,dprev))

Call:
lm(formula = DPREV6 ~ GENDER, data = dprev)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.6341 -0.5503  0.3659  0.4497  0.4497 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.71780    0.02702  26.565  < 2e-16 ***
GENDER      -0.08375    0.01685  -4.971 6.99e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 0.4902 on 3392 degrees of freedom
Multiple R-Squared: 0.007232,   Adjusted R-squared: 0.00694 
F-statistic: 24.71 on 1 and 3392 DF,  p-value: 6.992e-07

#
# Now we use the post-imputation commands glm.mids and pool
# from the mice library to compare the results from the imputed data
# We can see that the estimates are similar with the intercept significantly
# higher in the imputed data. fmi (below) stands for the fraction of missing 
# information.
#

 summary(pool(glm.mids(DPREV6~GENDER,impprev)))

 mids(DPREV6~GENDER,data=impprev)))
                    est         se         t       df     Pr(>|t|)
(Intercept)  0.73440373 0.02743557 26.768303 112.9948 0.000000e+00
GENDER      -0.08629958 0.01667142 -5.176498 190.1192 5.723487e-07
                 lo 95       hi 95 missing       fmi
(Intercept)  0.6800489  0.78875858      NA 0.2772072
GENDER      -0.1191843 -0.05341486       0 0.2115440

#
# This only works so far for linear models
# The post imputation functions from NORM
# allow logistic regression, though they are more work
#

 summary(glm(DPREV6~GENDER,dprev,family='binomial'))

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.89739    0.11378   7.887 3.09e-15 ***
GENDER      -0.34774    0.07029  -4.947 7.52e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

(Dispersion parameter for binomial family taken to be 1)

library(norm)
coeffs<-list(10)
sds<-list(10)
for (i in 1:10){
coeffs[[i]]<- summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,1]
sds[[i]]<- summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,2]
}
result<-mi.inference(coeffs,sds)result
ans<-cbind(result[[1]],result[[2]],result[[8]]*100)
dimnames(ans)[[2]]<-c('mean','se','% info lost')
print(ans)

                  mean         se % info lost
(Intercept)  0.9732241 0.11810667    30.21033
GENDER      -0.3623463 0.07061113    22.52466

#  again results very similar for oods ratio

#  back to top 
#

####################################################
#
# now the same small model fitted with NORM
# as there is a serious bug in this package
# It has been reported by others and now
#
# converges in a few iterations OK
# now impute and plot some imputed values

#  back to top 
####################################################
#  now a bigger model first remove the CASEID
# as we don't want to use it in imputation
# also the prevalence columns because they 
# can be derived from the others
#
# have also missed off the drug use variables because 
# for some reason they will not be accepted as factors
# even though they only have 4 categories
# names(data)
######################################################

forimp<-data[,c(-1,-22,-25,-28,-31,-34,-37,-40:-45)]
summary(forimp)

#
# make a vector for methods and check it<
# note that norm had to be used for all categories with >4 
# cases
#

method<-c( rep("norm",12),rep("logreg",7),'polyreg',
rep(c('norm'),12))
rbind(names(forimp),method)

  "SASMKY80" "SBSMKY80" "SCSMKY80" "SDSMKY80"
method "norm"     "norm"     "norm"     "norm"    
       [,5]       [,6]       [,7]       [,8]      
       "SESMKY80" "SFSMKY80" "RAALCY80" "RBALCY80"
method "norm"     "norm"     "norm"     "norm"    
       [,9]       [,10]      [,11]      [,12]     
       "RCALCY80" "RDALCY80" "REALCY80" "RFALCY80"
method "norm"     "norm"     "norm"     "norm"    
       [,13]    [,14]    [,15]      [,16]     [,17]    
       "GENDER" "ETHGP"  "HZRESPRE" "SZINDEP" "SZABS04"
method "logreg" "logreg" "logreg"   "logreg"  "logreg" 
       [,18]      [,19]     [,20]     [,21]   [,22]  
       "HZEVREFI" "YZLEAVE" "SECTOR"  "DVAR1" "DVOL1"
method "logreg"   "logreg"  "polyreg" "norm"  "norm" 
       [,23]   [,24]   [,25]   [,26]   [,27]   [,28]  
       "DVAR2" "DVOL2" "DVAR3" "DVOL3" "DVAR4" "DVOL4"
method "norm"  "norm"  "norm"  "norm"  "norm"  "norm" 
       [,29]   [,30]   [,31]   [,32]  
       "DVAR5" "DVOL5" "DVAR6" "DVOL6"
method "norm"  "norm"  "norm"  "norm"

impbig<-mice(forimp,imputationMethod =method,m=4,maxit=30)

#
# plot convergence earlier plots suggested 30 iterations would be needed
# This is just one example

plot(impbig,F)


#  back to top 
#
# now plot at observed and imputed values 
#

par(mfrow=c(2,2))
hist(as.matrix(forimp$DVOL6),main='DVOL6 observed',xlab='volume',col=3)
hist(forimp$DVAR6,main='DVAR6 observed',xlab='variety',col=3)
hist(impbig$imp$DVOL6[,4],main='DVOL6 imputed',xlab='volume',col=4)
hist(impbig$imp$DVAR6[,4],main='DVAR6 imputed',xlab='variety',col=4)

#
# we can see that imputing as normals has given 
# distributions with negative values for items that
# ought to be positive. This is fixed below.
# Now make a big data frame of complete data
# with the 4 imputations stacked
#

bigcomplete<-complete(impbig,'long')
names(bigcomplete)
bigcomplete[,21:32]<-round(bigcomplete[,21:32])

 lines missed out here - see programme code

bigcomplete[,35:40][bigcomplete[,35:40]>0]<-1
apply(bigcomplete[,35:40][bigcomplete$'.imp'==4,],2,mean)*100)
round(prevfrombig,2)

      [,1]  [,2]  [,3]  [,4]
DPREV1 73.75 73.61 73.71 73.73
DPREV2 73.43 72.80 73.36 73.29
DPREV3 78.77 78.74 78.58 78.60
DPREV4 75.35 75.39 75.49 75.25
DPREV5 73.01 72.90 72.80 72.69
DPREV6 62.55 61.99 62.50 62.75

#
# these results a little higher than the smaller imputation
# but not by much
#

#  back to top 
#
# now repeat this using norm which treats everything as normal
# There is a bug in this program so not working now
# ex6.R gives code in case it does start working
#