##############################################
#
# HOW TO USE THIS FILE
# This is a  text version o (ex3.R) which can just be sourced
# 
#
#########  ANY TEXT AFTER THE # CHARACTER ARE TREATED AS COMMENTS
#
# data stored in c:/GR/aprojects/peas/web/exemp3/data/ex3.Rdata
# this file is c:/GR/aprojects/peas/web/exemp3/program_code/ex3.R
#
##############################################
#
# data are in a data frame health
# to get the names of the variables
#
names(health)
#
# the cigarette smoking data is in a factor called health$CIGST1
# to get a table of this variable do
#
table(health$CIGST1)
#
# the next bit of code makes a new 1/0 variable for smokers
# and stores it in the data frame (health)
#
health$SMOKER<-(health$CIGST1=='Current cigarette smoker')
health$SMOKER[health$CIGST1=='Dont know']<-NA
health$SMOKER[health$CIGST1=='Refused/Not answered']<-NA
health$SMOKER[health$CIGST1=='schedule not obtained']<-NA
#
# to check that it has worked correctly
# you will see that we have set the result to the R missing value (NA)
# when question was not answered
#
table(health$CIGST1,health$SMOKER,exclude=NULL)
#
# now it is time to try using the survey package
#
# If you have not already done so you will need to 
# download the survey package 
# (>>pakages menu >> install package from cran>> select a mirror near you
#               and pick the survey pakage from the list)
# then to give R access to the survey functions do
#
library(survey)
#
#
# The first step is to create a SURVEY DESIGN OBJECT that 
# will contain both the data and the information about the design
#
# This design has WEIGHTS (WEIGHTA), 
#  it is clustered so it has PSUs (PSU)
#  and it is stratified (implicitly) by deprivation category within region
#  and the PSUs have been grouped into implicit strata (REGSTRAT)
# with just 2 or three PSUs per stratum
# There are TWO kinds of survey design
# 
#  1. Using simple methods , usually for surveys WITHOUT POST-STRATIFICATION
#  2. Using replicate weights, usually for surveys with POST-STARTIFICATION
#
#  The Scottish Health Survey STRATIFICATION and POST-STRATIFICATION  so  
#   to allow for all features we should use functions of the second kind
#  But first we set up a design to allow for clustering weighting and stratification
#  that could be analysed by the simple methods
#
health.des <- svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health)
#
# see the html help file for the function svydesign for more explanation
# the ~ sign refers to it being one of the columns of the data
# the R object health.des now contains all the information about the design
# including the data
# To get details of the design
summary(health.des)
#
# now we can use the svymean command to get the smoking rate and its standard error
#
svymean(~SMOKER,health.des,deff=T)
#
# annoyingly this gives a missing result because of the missing values
# but we can get over this by using a command to restrict the survey design
# to the non-missing cases (!=not is.na identifies missing data)
#
svymean(~SMOKER,subset(health.des,!is.na(SMOKER)))

# this gives a rate of 0.332 (33.2%) and its standard error of 0.006 (0.6%)
# we can compare this with the corresponding unweighted mean 
# which again has to be restricted to the unweighted cases
#
mean(health$SMOKER[!is.na(health$SMOKER)])
#
# this gives the higher rate of 34.79%
#
# The code below plots the illustration in the Web page for this exemplar
# It is restricted to adults under 65 to make it comparable with 1995 data
# To understand it fully you would need to learn more R from one of the help
# guides
#
rawmean<-mean(health$SMOKER[!is.na(health$SMOKER) & health$AGE<65])

wtmean<-print( svymean( ~SMOKER,subset(health.des,!is.na(SMOKER) & AGE<65)) )

par(mfrow=c(1,1))

plot(c(31,40),c(0,1),type='n',xlab='smoking rate (%)',ylab='',yaxt='n')
legend(31,1,c('weighted rate 95% C. I.','unweighted rate','1995 weighted rate'),
pch=c(15,16,17),col=c(1,2,3),bty='n',cex=c(1.5,1.5,1.5),lty=c(1,0,0))
points(rawmean*100,0.2,pch=16,cex=2,col=2)
lines((wtmean[1]+c(-2,2)*wtmean[2])*100,c(0.2,0.2))
points(wtmean[1]*100,0.2,pch=15,cex=2)
points(35,.2,pch=17,cex=2,col=3)
text(39,.2,'All')
#
# now by sex
#
rawmean<-mean(health$SMOKER[!is.na(health$SMOKER) & health$SEX=='male'& health$AGE<65])
wtmean<-print( svymean( ~SMOKER,subset(health.des,(!is.na(SMOKER) & SEX=='male'& AGE<65)  ) ) )
points(rawmean*100,0.4,pch=16,cex=2,col=2)
lines((wtmean[1]+c(-2,2)*wtmean[2])*100,c(0.4,0.4))
points(wtmean[1]*100,0.4,pch=15,cex=2)
points(34,.4,pch=17,cex=2,col=3)
text(39,.4,'male')

rawmean<-mean(health$SMOKER[!is.na(health$SMOKER) & health$SEX=='female'& health$AGE<65])
wtmean<-print( svymean( ~SMOKER,subset(health.des,(!is.na(SMOKER) & SEX=='female'& AGE<65)  ) ) )
points(rawmean*100,0.6,pch=16,cex=2,col=2)
lines((wtmean[1]+c(-2,2)*wtmean[2])*100,c(0.6,0.6))
points(wtmean[1]*100,0.6,pch=15,cex=2)
points(36,.6,pch=17,cex=2,col=3)
text(39,.6,'Female')
#
# We can explore how different features affect the design effect by 
# setting up different designs (see web page for this exemplar)
#
svymean(~SMOKER,subset(svydesign(id=~IDNO, weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)
svymean(~SMOKER,subset(svydesign(id=~PSU,
weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)
svymean(~SMOKER,subset(svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)

# the next stage is to convert it into a design with replicate weights
# we use the default options which produces, by default, jacknife samples
# the BRR method gives almost identical results
# see html help for as.svrepdesign 
#
health.rep.des.BRR <- as.svrepdesign(health.des,type='BRR',small='merge',large='split')
summary(health.rep.des.BRR)
svymean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)))

health.rep.des <- as.svrepdesign(health.des)
summary(health.rep.des)
svymean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)))
#
# an alternative method is to use BRR
#
health.rep.des.BRR <- as.svrepdesign(health.des,type='BRR',large='merge')
summary(health.rep.des.BRR)
svymean(~SMOKER,subset(health.rep.des.BRR,!is.na(SMOKER)))
#
# now we can post-stratify this by region and then by
# age and sex
# This will not affect the weights, since this is already included here
# but it MIGHT improve the precisio of estimates
#
tab.region <- xtabs(GROSSWT~REGION,data=health)
health.rep.des<-postStratify(health.rep.des,~REGION,tab.region)
svymean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T)

tab.agesex <- xtabs(GROSSWT~SEX+AGEG,data=health)
health.rep.des<-postStratify(health.rep.des,~SEX+AGEG,tab.agesex)
svymean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T)

#
# In fact you will see that it does little to improve things
#
health.rep.des.BRR<-postStratify(health.rep.des.BRR,~REGION,tab.region)
svymean(~SMOKER,subset(health.rep.des.BRR,!is.na(SMOKER)),deff=T)

health.rep.des.BRR<-postStratify(health.rep.des,~SEX+AGEG,tab.agesex)
svymean(~SMOKER,subset(health.rep.des.BRR,!is.na(SMOKER)),deff=T)
#
# and the same is true for BRR methods

# now we look at the precision of estimates for regions and Health Boards
# 
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lothian & Fife'))
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lothian'))
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Fife'))
summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lothian & Fife'),family='binomial'))

# 
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lanarkshire,Ayrshire & Arran'))
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lanarkshire'))
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Ayrshire & Arran'))
summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lanarkshire,Ayrshire & Arran'),family='binomial'))

# and also for BRR method
# precision of estimates for regions and Health Boards
# 
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lothian & Fife'))
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lothian'))
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Fife'))
summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lothian & Fife'),family='binomial'))

# 
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lanarkshire,Ayrshire & Arran'))
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lanarkshire'))
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Ayrshire & Arran'))
summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lanarkshire,Ayrshire & Arran'),family='binomial'))

#
# now some logistic regression models to explore influences on SMOKING
#
summary(svyglm(SMOKER~SEX+AGEG+REGION,design=subset(health.des,!is.na(SMOKER)),family='binomial'))
summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+SC+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial'))
summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+CARSTG5+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial'))
#
# and now some barplots to illustrate
# the importance of different factors
# in the models above  6 per page
#
par(mfrow=c(2,3),oma=c(0,0,0,0))
# 
# Social Class
#
tt<-svyby(~SMOKER,~SC,svymean,design=subset(health.des,!is.na(SMOKER)))
barplot(100*tt[,2],names=c('NK',1,2,3.1,3.2,4,5,'NA'),main='by social class',ylab='% cigarette smokers',ylim=c(0,60))
#
# area deprivation
#
tt<-svyby(~SMOKER,~CARSTG5,svymean,design=subset(health.des,!is.na(SMOKER)))
barplot(100*tt[,2],names=c(1:5),main='by 5 Carstairs groups',ylab='% cigarette smokers',ylim=c(0,50))
#
# number of adults in household- first need to group large numbers together
# and this needs to be done by adding a variable to the design object
#
health.des<-update(health.des,NOFAD2=NOFAD*(NOFAD<=5)+5*(NOFAD>5))
tt<-svyby(~SMOKER,~NOFAD2,svymean,design=subset(health.des,!is.na(SMOKER)))
barplot(100*tt[,2],names=c(1:4,'5+'),main='by number of adults',ylab='% cigarette smokers',ylim=c(0,50))
#
# regions
#
tt<-svyby(~SMOKER,~REGION,svymean,design=subset(health.des,!is.na(SMOKER)))
barplot(100*tt[,2],names=c('H&I','G&T','L&F','BD&G','Gla','LA&A','FVA'),main='by regions',ylab='% cigarette smokers',ylim=c(0,50))

#
# age for men
#
tt<-svyby(~SMOKER,~AGEG,svymean,design=subset(health.des,!is.na(SMOKER) & SEX=='male'))
barplot(100*tt[,2],main='age groups men',ylab='% cigarette smokers',names=17+5*(0:11),ylim=c(0,50))
#
# age groups women
#
tt<-svyby(~SMOKER,~AGEG,svymean,design=subset(health.des,!is.na(SMOKER)& SEX=='female'))
barplot(100*tt[,2],main='age groups women',names=17+5*(0:11),ylab='% cigarette smokers',ylim=c(0,50))