#################################################################
#
# NOTE BOTH THE PAKAGES FEATURED HERE (NORM AND MICE)
# ARE NOT WORKING PROPERLY ON THE LATEST VERSIONS OF
# THE R INSTALLATION (VERSION 2.1)
#
# THIS FILE AND OTHERS ON THE PEAS SITE WILL
# BE BROUGHT UP TO DATE WHEN THEY ARE REINSTALLED
#
################################################
# HOW TO USE THIS FILE 
# This is an HTML file to show you what R output looks like 
# Comments relate to interpretation of output 
# for comments on how torun code see the html R source code
red stuff is (R commands)
blue is output
# Black is comment

Links in this page

Using MICE for a small imputation problem
Post imputation procedures for this
Using norm for a smallimputation problem
MICE imputation procedures for larger data set
Post imputation procedures for larger data set
Trying norm for big data set

names(data)
dprev<-data[,c(14,22,25,28,31,34,37)]
names(dprev)
  
[1] "GENDER" "DPREV1" "DPREV2" "DPREV3" "DPREV4" "DPREV5"
[7] "DPREV6"

library(mice)
impprev<-mice(dprev,m=10, maxit=30,imputationMethod = rep("logreg",7),printFlag=T)

 iter imp variable
  1   1  DPREV1  DPREV2  DPREV3  DPREV4  DPREV5  DPREV6
  1   2  DPREV1  DPREV2  DPREV3  DPREV4  DPREV5  DPREV6
  1   3  DPREV1  DPREV2  DPREV3  DPREV4  DPREV5  DPREV6
  1   4  DPREV1  DPREV2  DPREV3  DPREV4  DPREV5  DPREV6
  1   5  DPREV1  DPREV2  DPREV3  DPREV4  DPREV5  DPREV6
  
#
# More lines not shown for 30 iterations and 10 imputations at each
# Before you use plot make sure that you have saved the
#   command file newfuns.R to your computer and
# have sourced it into R (file menu from console). 
# It fixes a problem with the plots
# and summary commands on the current version of R (July 05)
##########################################
plot(impprev,F)
#
# this shows the convergence plots, ideally they
# should be without trend and uncorrelated
# They are here - just one shwon - hit new

# lines to see each one 
# # back to top uparrow #
print(impprev)
Multiply imputed data set
Call:
mice(data = dprev, m = 10, imputationMethod = rep("logreg", 7), 
    maxit = 30, printFlag = T)
Number of multiple imputations:  10
Missing cells per column:
GENDER DPREV1 DPREV2 DPREV3 DPREV4 DPREV5 DPREV6 
     0    429    336    251    331    576    934 
Imputation methods:
  GENDER   DPREV1   DPREV2   DPREV3   DPREV4   DPREV5 
"logreg" "logreg" "logreg" "logreg" "logreg" "logreg" 
  DPREV6 
"logreg" 
VisitSequence:
DPREV1 DPREV2 DPREV3 DPREV4 DPREV5 DPREV6 
     2      3      4      5      6      7 
PredictorMatrix:
       GENDER DPREV1 DPREV2 DPREV3 DPREV4 DPREV5 DPREV6
GENDER      0      0      0      0      0      0      0
DPREV1      1      0      1      1      1      1      1
DPREV2      1      1      0      1      1      1      1
DPREV3      1      1      1      0      1      1      1
DPREV4      1      1      1      1      0      1      1
DPREV5      1      1      1      1      1      0      1
DPREV6      1      1      1      1      1      1      0
Random generator seed value:  0 
names(impprev)
 [1] "call"             "data"            
 [3] "m"                "nmis"            
 [5] "imp"              "imputationMethod"
 [7] "predictorMatrix"  "visitSequence"   
 [9] "seed"             "iteration"       
[11] "lastSeedValue"    "chainMean"       
[13] "chainVar"         "pad" 
#
# the imp component gives the results for 
# the rows with any missing data with all
# the imputations for one variable grouped together
# examine top 10 rows for DPREV6
#
names(impprev$imp)
[1] "GENDER" "DPREV1" "DPREV2" "DPREV3" "DPREV4" "DPREV5"
[7] "DPREV6"
impprev$imp$DPREV6[1:10,]
   1 2 3 4 5 6 7 8 9 10
1  0 1 1 1 1 1 1 1 1  1
5  1 1 1 1 1 1 1 0 0  1
11 1 0 1 1 0 1 1 1 1  0
18 1 0 1 1 1 0 0 1 1  1
20 1 1 0 1 0 1 1 1 0  1
29 0 1 1 1 1 0 0 1 1  1
32 0 1 1 0 1 1 1 0 1  1
41 0 1 1 1 1 1 1 1 1  1
47 1 1 1 1 1 1 0 1 1  1
50 1 0 0 0 0 0 0 1 0  0
#
# complete(,i) gives ith complete imputati
################################################
#  next bit of code gets a matrix to show how
#   the imputed data changes the prevalence
################################################
complmeans<-apply(dprev,2,mean,na.rm=T)[-1]*100
impmeans<-100*rbind(
apply(complete(impprev,1),2,mean)[-1],
apply(complete(impprev,2),2,mean)[-1],
apply(complete(impprev,3),2,mean)[-1],
apply(complete(impprev,4),2,mean)[-1],
apply(complete(impprev,5),2,mean)[-1],
apply(complete(impprev,6),2,mean)[-1],
apply(complete(impprev,7),2,mean)[-1],
apply(complete(impprev,8),2,mean)[-1],
apply(complete(impprev,9),2,mean)[-1],
apply(complete(impprev,10),2,mean)[-1])
allmeans<-rbind(complmeans,impmeans)
print(round(allmeans,2))
           DPREV1 DPREV2 DPREV3 DPREV4 DPREV5 DPREV6
complmeans  73.07  72.62  78.00  74.33  71.06  59.02
            73.48  73.01  78.10  74.77  72.34  61.32
            73.41  72.92  78.35  74.95  71.72  60.21
            73.15  72.83  78.30  75.02  71.95  60.84
            73.24  73.22  78.19  74.86  71.93  59.77
            73.17  72.85  78.12  74.88  72.37  59.33
            72.99  72.85  78.28  74.75  71.95  60.88
            73.06  72.94  78.37  75.07  72.18  60.67
            73.50  72.97  78.26  75.16  71.90  60.33
            73.17  72.83  78.40  74.77  71.77  61.34
            73.15  73.11  78.47  75.09  72.09  60.74

# YOu can see that the imputed data are giving very 
# similar mean values to the complete data on the top line
#
 
#
#  back to top uparrow
#
# post imputation inference for this data set
# first simple linear model predcition of prevalence
# compared to original data
#
 summary(lm(DPREV6~GENDER,dprev))
 
Call:
lm(formula = DPREV6 ~ GENDER, data = dprev)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.6341 -0.5503  0.3659  0.4497  0.4497 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.71780    0.02702  26.565  < 2e-16 ***
GENDER      -0.08375    0.01685  -4.971 6.99e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 0.4902 on 3392 degrees of freedom
Multiple R-Squared: 0.007232,   Adjusted R-squared: 0.00694 
F-statistic: 24.71 on 1 and 3392 DF,  p-value: 6.992e-07 
#
# Now we use the post-imputation commands glm.mids and pool
# from the mice library to compare the results from the imputed data
# We can see that the estimates are similar with the intercept significantly
# higher in the imputed data. fmi (below) stands for the fraction of missing 
# information.
#
 summary(pool(glm.mids(DPREV6~GENDER,impprev)))
 
 mids(DPREV6~GENDER,data=impprev)))
                    est         se         t       df     Pr(>|t|)
(Intercept)  0.73440373 0.02743557 26.768303 112.9948 0.000000e+00
GENDER      -0.08629958 0.01667142 -5.176498 190.1192 5.723487e-07
                 lo 95       hi 95 missing       fmi
(Intercept)  0.6800489  0.78875858      NA 0.2772072
GENDER      -0.1191843 -0.05341486       0 0.2115440
#
# This only works so far for linear models
# The post imputation functions from NORM
# allow logistic regression, though they are more work
#
 summary(glm(DPREV6~GENDER,dprev,family='binomial'))
Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.89739    0.11378   7.887 3.09e-15 ***
GENDER      -0.34774    0.07029  -4.947 7.52e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

(Dispersion parameter for binomial family taken to be 1)
library(norm)
coeffs<-list(10)
sds<-list(10)
for (i in 1:10){
coeffs[[i]]<- summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,1]
sds[[i]]<- summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,2]
}
result<-mi.inference(coeffs,sds)result
ans<-cbind(result[[1]],result[[2]],result[[8]]*100)
dimnames(ans)[[2]]<-c('mean','se','% info lost')
print(ans)
                  mean         se % info lost
(Intercept)  0.9732241 0.11810667    30.21033
GENDER      -0.3623463 0.07061113    22.52466
#  again results very similar for oods ratio

#  back to top uparrow
#
####################################################
#
# now the same small model fitted with NORM # as there is a serious bug in this package
# It has been reported by others and now
#
# converges in a few iterations OK
# now impute and plot some imputed values
# back to top uparrow #################################################### # now a bigger model first remove the CASEID # as we don't want to use it in imputation # also the prevalence columns because they # can be derived from the others # # have also missed off the drug use variables because # for some reason they will not be accepted as factors # even though they only have 4 categories # names(data) ######################################################
forimp<-data[,c(-1,-22,-25,-28,-31,-34,-37,-40:-45)]
summary(forimp)
#
# make a vector for methods and check it<
# note that norm had to be used for all categories with >4 
# cases
#
method<-c( rep("norm",12),rep("logreg",7),'polyreg',
rep(c('norm'),12))
rbind(names(forimp),method)
  "SASMKY80" "SBSMKY80" "SCSMKY80" "SDSMKY80"
method "norm"     "norm"     "norm"     "norm"    
       [,5]       [,6]       [,7]       [,8]      
       "SESMKY80" "SFSMKY80" "RAALCY80" "RBALCY80"
method "norm"     "norm"     "norm"     "norm"    
       [,9]       [,10]      [,11]      [,12]     
       "RCALCY80" "RDALCY80" "REALCY80" "RFALCY80"
method "norm"     "norm"     "norm"     "norm"    
       [,13]    [,14]    [,15]      [,16]     [,17]    
       "GENDER" "ETHGP"  "HZRESPRE" "SZINDEP" "SZABS04"
method "logreg" "logreg" "logreg"   "logreg"  "logreg" 
       [,18]      [,19]     [,20]     [,21]   [,22]  
       "HZEVREFI" "YZLEAVE" "SECTOR"  "DVAR1" "DVOL1"
method "logreg"   "logreg"  "polyreg" "norm"  "norm" 
       [,23]   [,24]   [,25]   [,26]   [,27]   [,28]  
       "DVAR2" "DVOL2" "DVAR3" "DVOL3" "DVAR4" "DVOL4"
method "norm"  "norm"  "norm"  "norm"  "norm"  "norm" 
       [,29]   [,30]   [,31]   [,32]  
       "DVAR5" "DVOL5" "DVAR6" "DVOL6"
method "norm"  "norm"  "norm"  "norm" 
 
impbig<-mice(forimp,imputationMethod =method,m=4,maxit=30)
#
# plot convergence earlier plots suggested 30 iterations would be needed # This is just one example
plot(impbig,F)

#  back to top uparrow
#
# now plot at observed and imputed values
#
par(mfrow=c(2,2))
hist(as.matrix(forimp$DVOL6),main='DVOL6 observed',xlab='volume',col=3)
hist(forimp$DVAR6,main='DVAR6 observed',xlab='variety',col=3)
hist(impbig$imp$DVOL6[,4],main='DVOL6 imputed',xlab='volume',col=4)
hist(impbig$imp$DVAR6[,4],main='DVAR6 imputed',xlab='variety',col=4)

#
# we can see that imputing as normals has given 
# distributions with negative values for items that
# ought to be positive. This is fixed below.
# Now make a big data frame of complete data
# with the 4 imputations stacked
#
bigcomplete<-complete(impbig,'long')
names(bigcomplete)
bigcomplete[,21:32]<-round(bigcomplete[,21:32])
 lines missed out here - see programme code
bigcomplete[,35:40][bigcomplete[,35:40]>0]<-1
apply(bigcomplete[,35:40][bigcomplete$'.imp'==4,],2,mean)*100)
round(prevfrombig,2)
      [,1]  [,2]  [,3]  [,4]
DPREV1 73.75 73.61 73.71 73.73
DPREV2 73.43 72.80 73.36 73.29
DPREV3 78.77 78.74 78.58 78.60
DPREV4 75.35 75.39 75.49 75.25
DPREV5 73.01 72.90 72.80 72.69
DPREV6 62.55 61.99 62.50 62.75
#
# these results a little higher than the smaller imputation
# but not by much
#

#  back to top uparrow
#
# now repeat this using norm which treats everything as normal
# There is a bug in this program so not working now
# ex6.R gives code in case it does start working
#