#################################################################
#
# NOTE BOTH THE PACKAGES FEATURED HERE (NORM AND MICE)
# ARE NOT WORKING PROPERLY ON THE LATEST VERSIONS OF
# THE R INSTALLATION (VERSION 2.1)
#
# THIS FILE AND OTHERS ON THE PEAS SITE WILL
# BE BROUGHT UP TO DATE WHEN THEY ARE REINSTALLED
#
###############################################
#
# HOW TO USE THIS FILE
# This file (ex6.R) can just be sourced
# into the R program, but it is really intended to be used interactively
#
######### ANY TEXT AFTER THE # CHARACTER IS TREATED AS A COMMENT
#
# data stored in c:/GR/aprojects/peas/web/exemp6/data/ex6.Rdata
# this file is c:/GR/aprojects/peas/web/exemp3/program_code/ex3_R.html
#
##############################################
#
# data are in a data frame called data
# to get the names of the variables
#
names(data)
#
######################################################################
# First a small example with just the prevalence data and gender
# binary data imputed with MICE
# If you have not done so already you will need to install
# the library from CRAN via the packages menu
# NOTE as of July 05, on R 2.1.1 this was missing from the selection
# list on the menu, so instead go to the developers' web site
# http://web.inter.nl.net/users/S.van.Buuren/mi/hmtl/mice.htm
# download the latest zip file and install this
#
# These steps only need doing once
######################################################################
dprev<-data[,c(14,22,25,28,31,34,37)]
names(dprev)
#
# set up the library - you need to do this every time you
# want to use mice
#
# Now the imputations
# You need to specify logistic regression because the codes are numeric
# maxit has been set to 30 to ensure convergence
#
library(mice)
impprev<-mice(dprev,m=10,maxit=30,imputationMethod=rep("logreg",7),printFlag=T)
#
# impprev is a special object that stores the different bits
# of the imputation efficiently. It is a list, and the component
# imp gives another list of the imputed values for each
# imputed variable for cases with missing data only
# These commands will show you it
# Before you use plot make sure that you have saved the
# command file newfuns.R to your computer and
# have sourced it into R (file menu from console).
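#
# A minimal sketch of that step, assuming newfuns.R has been saved in the
# current working directory (adjust the path to wherever you put it):
#
# source("newfuns.R")    # loads the replacement plot and summary functions
#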
# It fixes a problem with the plots
# and summary commands on the current version of R (July 05)
#
plot(impprev,F)
#
# this shows the convergence plots; ideally they
# should be without trend and uncorrelated
# They are here
#
print(impprev)
names(impprev)
#
# the imp component gives the results for
# the rows with any missing data, with all
# the imputations for one variable grouped together
# examine the top 10 rows for DPREV6
#
names(impprev$imp)
impprev$imp$DPREV6[1:10,]
#
# complete(,i) gives the ith complete imputation
################################################
# the next bit of code gets a matrix to show how
# the imputed data change the prevalence
################################################
complmeans<-apply(dprev,2,mean,na.rm=T)[-1]*100
impmeans<-100*rbind(
 apply(complete(impprev,1),2,mean)[-1],
 apply(complete(impprev,2),2,mean)[-1],
 apply(complete(impprev,3),2,mean)[-1],
 apply(complete(impprev,4),2,mean)[-1],
 apply(complete(impprev,5),2,mean)[-1],
 apply(complete(impprev,6),2,mean)[-1],
 apply(complete(impprev,7),2,mean)[-1],
 apply(complete(impprev,8),2,mean)[-1],
 apply(complete(impprev,9),2,mean)[-1],
 apply(complete(impprev,10),2,mean)[-1])
allmeans<-rbind(complmeans,impmeans)
print(round(allmeans,2))
#
# post-imputation inference for this data set
# first a simple linear model prediction of prevalence
# compared to the original data
#
summary(lm(DPREV6~GENDER,dprev))
summary(pool(glm.mids(DPREV6~GENDER,data=impprev)))
#
# This only works so far for linear models
# The post-imputation functions from NORM
# allow logistic regression, though they are more work
#
summary(glm(DPREV6~GENDER,dprev,family='binomial'))
library(norm)
coeffs<-list(10)
sds<-list(10)
for (i in 1:10){
 coeffs[[i]]<-summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,1]
 sds[[i]]<-summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,2]
}
result<-mi.inference(coeffs,sds)
result
ans<-cbind(result[[1]],result[[2]],result[[8]]*100)
dimnames(ans)[[2]]<-c('mean','se','% info lost')
print(ans)
####################################################
#
# now the same small model fitted with NORM
# This code is commented out just now
# as there is a serious bug in this package
# It has been reported by others and now
# by me too
#
prevm<-as.matrix(dprev)
#prelim.prev<-prelim.norm(prevm)
#theta.prev<-em.norm(prelim.prev)
#getparam.norm(prelim.prev,theta.prev)
#
# converges in a few iterations OK
# now impute and plot some imputed values
#
#imputed.prev<-imp.norm(prelim.prev,theta.prev,prevm)
#imputed.prev[1:30,]
#par(mfrow=c(2,3))
#for (i in 2:7) hist(imputed.prev[,i],col=3,main=paste('prevalence',i-1))
#
#imputed.prev<-round(imputed.prev)
#for (i in 2:7) print(table(imputed.prev[,i]))
#imputed.prev[,2:7][imputed.prev[,2:7]<0]<-0
#imputed.prev[,2:7][imputed.prev[,2:7]>1]<-1
#apply(imputed.prev[,2:7],2,mean)*100
#print(round(allmeans,2))
#
####################################################
# now a bigger model: first remove the CASEID
# as we don't want to use it in imputation,
# and also the prevalence columns because they
# can be derived from the others
#
# the drug use variables have also been left out because
# for some reason they will not be accepted as factors
# even though they only have 4 categories
#
names(data)
######################################################
forimp<-data[,c(-1,-22,-25,-28,-31,-34,-37,-40:-45)]
summary(forimp)
#
# make a vector of methods and check it
# note that norm had to be used for all variables with more than 4 categories
#
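# As a quick illustrative check (a sketch only, not needed for the analysis),
# the number of distinct values each variable in forimp takes can be
# listed with, for example:
#
# sapply(forimp, function(x) length(unique(na.omit(x))))
#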
method<-c(rep("norm",12),rep("logreg",7),'polyreg',rep('norm',12))
rbind(names(forimp),method)
#
# maxit increased to 30 when the plots were clearly not converged
#
impbig<-mice(forimp,imputationMethod=method,m=4,maxit=30)
#
# plot convergence
#
plot(impbig,F)
#
# now plot the observed and imputed values
#
par(mfrow=c(2,2))
hist(as.matrix(forimp$DVOL6),main='DVOL6 observed',xlab='volume',col=3)
hist(forimp$DVAR6,main='DVAR6 observed',xlab='variety',col=3)
hist(impbig$imp$DVOL6[,4],main='DVOL6 imputed',xlab='volume',col=4)
hist(impbig$imp$DVAR6[,4],main='DVAR6 imputed',xlab='variety',col=4)
#
# we can see that imputing as normals has given
# distributions with a continuous range of values
# This is fixed below
#
# now make a big data frame of complete data
# with the 4 imputations stacked
#
bigcomplete<-complete(impbig,'long')
names(bigcomplete)
#
# first round all values of VOL and VAR imputed as normals
# to the nearest integer and set any negative values to zero
# then tidy the data for volume, since it needs to be
# zero when variety is zero,
# and add new columns for DPREV1 to DPREV6
#
bigcomplete[,21:32]<-round(bigcomplete[,21:32])
bigcomplete[,21:32][bigcomplete[,21:32]<0]<-0
bigcomplete[,22][bigcomplete[,23]==0]<-0 # set vols to 0 if var 0
bigcomplete[,24][bigcomplete[,25]==0]<-0
bigcomplete[,26][bigcomplete[,27]==0]<-0
bigcomplete[,28][bigcomplete[,29]==0]<-0
bigcomplete[,30][bigcomplete[,31]==0]<-0
bigcomplete[,22][bigcomplete[,23]>0]<-1 # set vols to 1 if var > 0
bigcomplete[,24][bigcomplete[,25]>0]<-1
bigcomplete[,26][bigcomplete[,27]>0]<-1
bigcomplete[,28][bigcomplete[,29]>0]<-1
bigcomplete[,30][bigcomplete[,31]>0]<-1
bigcomplete$DPREV1<-bigcomplete$DVAR1 # prev to 1 if var > 0
bigcomplete$DPREV2<-bigcomplete$DVAR2
bigcomplete$DPREV3<-bigcomplete$DVAR3
bigcomplete$DPREV4<-bigcomplete$DVAR4
bigcomplete$DPREV5<-bigcomplete$DVAR5
bigcomplete$DPREV6<-bigcomplete$DVAR6
bigcomplete[,35:40][bigcomplete[,35:40]>0]<-1
#
# and make a matrix of prevalence from the completed data for each imputation
#
prevfrombig<-
 cbind(apply(bigcomplete[,35:40][bigcomplete$'.imp'==1,],2,mean)*100,
 apply(bigcomplete[,35:40][bigcomplete$'.imp'==2,],2,mean)*100,
 apply(bigcomplete[,35:40][bigcomplete$'.imp'==3,],2,mean)*100,
 apply(bigcomplete[,35:40][bigcomplete$'.imp'==4,],2,mean)*100)
round(prevfrombig,2)
#
# now repeat this using norm, which treats everything as normal
#
library(norm)
#
# rngseed(100)
datam<-as.matrix(data[,-c(1,22,25,28,31,34,37)]) # exclude prevalence as this leads to collinearity
prelim.datam<-prelim.norm(datam)
theta.datam<-em.norm(prelim.datam)
#
# notice that this goes to 1000 iterations,
# the maximum, which looks a bit dodgy
# and is not likely to improve with more
#
imputed.datam<-imp.norm(prelim.datam,theta.datam,datam)
imputed.datam[1:20,]
#
# clearly some of these imputed values are nonsense
# bug in code reported
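#
# One way to tidy these up, in the same spirit as the rounding used for
# bigcomplete above, is a sketch like this (which columns are counts and
# should be clamped depends on the layout of datam):
#
# imputed.datam<-round(imputed.datam)
# imputed.datam[imputed.datam<0]<-0
#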
##############################################################
#############################################################
#
# the next bit does the plots to put on the web site
#
# (note: mean.g and sd.g are not created in this file and must already
# exist in the workspace before this block will run)
result<-mi.inference(mean.g,sd.g)
print("girls")
ans<-cbind(result[[1]],result[[2]],result[[8]]*100)
dimnames(ans)[[2]]<-c('mean','se','% info lost')
print(ans)
#####################################################################
# this is data brought in from Stata for comparison
#################################################################
bigimpprev<-matrix(c(.7356747,0.4410242,
 0.7308226,0.4435836,
 0.7839649,0.4115861,
 0.7495379,0.4333292,
 0.7220425,0.4480441,
 0.6042052,0.4890772,
 0.7324399,0.442738,
 0.728512,0.4447786,
 0.7835028,0.4119046,
 0.7488447,0.4337278,
 0.7220425,0.4480441,
 0.6136784,0.486962,
 0.7333641,0.4422515,
 0.728512,0.4447786,
 0.784658,0.4111069,
 0.7520795,0.4318553,
 0.7195009,0.449295,
 0.6046673,0.4889785,
 0.7312847,0.4433428,
 0.7310536,0.4434633,
 0.7832717,0.4120635,
 0.7481516,0.4341248,
 0.7204251,0.4488422,
 0.6053604,0.4888296),,2,byrow=T)*100
bigimpprev<-matrix(bigimpprev[,1],,4)
bigimpsas<-read.table('sasout.txt')
names(bigimpsas)<-c('imp','sweep','mean','sd')
simple<-matrix(c(1,4328,.7356747,.4410242,0,1
 ,2,4328,.7308226,.4435836,0,1
 ,3,4328,.7839649,.4115861,0,1
 ,4,4328,.7495379,.4333292,0,1
 ,5,4328,.7220425,.4480441,0,1
 ,6,4328,.6042052,.4890772,0,1
 ,1,4328,.7324399,.442738,0,1
 ,2,4328,.728512,.4447786,0,1
 ,3,4328,.7835028,.4119046,0,1
 ,4,4328,.7488447,.4337278,0,1
 ,5,4328,.7220425,.4480441,0,1
 ,6,4328,.6136784,.486962,0,1
 ,1,4328,.7333641,.4422515,0,1
 ,2,4328,.728512,.4447786,0,1
 ,3,4328,.784658,.4111069,0,1
 ,4,4328,.7520795,.4318553,0,1
 ,5,4328,.7195009,.449295,0,1
 ,6,4328,.6046673,.4889785,0,1
 ,1,4328,.7312847,.4433428,0,1
 ,2,4328,.7310536,.4434633,0,1
 ,3,4328,.7832717,.4120635,0,1
 ,4,4328,.7481516,.4341248,0,1
 ,5,4328,.7204251,.4488422,0,1
 ,6,4328,.6053604,.4888296,0,1),,6,byrow=T)
impdetsas<-matrix(c(1,4328,0.7479205,0.4342568,0,1.0000000
 ,2,4328,0.7372921,0.4401559,0,1.0000000
 ,3,4328,0.7878928,0.4088476,0,1.0000000
 ,4,4328,0.7543900,0.4304980,0,1.0000000
 ,5,4328,0.7435305,0.4367344,0,1.0000000
 ,6,4328,0.7058688,0.4557039,0,1.0000000
 ,1,4328,0.7495379,0.4333292,0,1.0000000
 ,2,4328,0.7423752,0.4373768,0,1.0000000
 ,3,4328,0.7881238,0.4086848,0,1.0000000
 ,4,4328,0.7548521,0.4302245,0,1.0000000
 ,5,4328,0.7430684,0.4369919,0,1.0000000
 ,6,4328,0.7093346,0.4541219,0,1.0000000
 ,1,4328,0.7442237,0.4363471,0,1.0000000
 ,2,4328,0.7405268,0.4383962,0,1.0000000
 ,3,4328,0.7881238,0.4086848,0,1.0000000
 ,4,4328,0.7527726,0.4314499,0,1.0000000
 ,5,4328,0.7432994,0.4368632,0,1.0000000
 ,6,4328,0.7081793,0.4546528,0,1.0000000
 ,1,4328,0.7483826,0.4339926,0,1.0000000
 ,2,4328,0.7405268,0.4383962,0,1.0000000
 ,3,4328,0.7878928,0.4088476,0,1.0000000
 ,4,4328,0.7555453,0.4298131,0,1.0000000
 ,5,4328,0.7430684,0.4369919,0,1.0000000
 ,6,4328,0.7003235,0.4581692,0,1.0000000),,6,byrow=T)
impdetsas<-impdetsas[,c(1,3,4)]
impdetsas.mean<-matrix(impdetsas[,2],,4)*100
simple<-matrix(simple[,3],6,4)
simple
#
# set up an empty plot first, then add the lines
#
plot(c(1,6),range(c(allmeans,bigimpprev,bigimpsas$mean*100,impdetsas.mean)),type='n',
 xlab='sweep of survey',ylab='prevalence of offending')
for (i in 1:4) lines(1:6,simple[,i]*100,lwd=3,col=2,type='b',lty=1)
for (i in 1:11) {col=1; if (i>1) col=2; lty=1; if (i>1) lty=2
 #lines(1:6,allmeans[i,],col=col,type='b',lty=lty)
}
legend(1,67,c('observed','SAS from summaries','R from summaries',
 'Stata from summaries','SAS from original questions'),
 lty=c(1:4,6),col=c(1:4,6),lwd=c(2,1,1,1))
for (i in 1:4) lines(1:6,prevfrombig[,i],col=3,lty=3,type='b')
for (i in 1:4) lines(1:6,bigimpprev[,i],col=4,lty=4,type='b')
for (i in 1:4) lines(1:6,bigimpsas$mean[bigimpsas$imp==i]*100,col=2,lty=2,type='b')
for (i in 1) lines(1:6,allmeans[i,],col=1,type='b',lty=1,lwd=2)
for (i in 1:4) lines(1:6,impdetsas.mean[,i],col=6,lty=6,type='b')
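#
# To save the finished figure for the web site, the plotting commands above
# could be wrapped in a graphics device; a sketch (the file name is only an
# example) would be:
#
# png('ex6_prevalence.png',width=600,height=600)
# ... repeat the plot(), lines() and legend() calls above ...
# dev.off()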