############################################## # # HOW TO USE THIS FILE # This is a text version o (ex3.R) which can just be sourced #  # ######### ANY TEXT AFTER THE # CHARACTER ARE TREATED AS COMMENTS # # data stored in c:/GR/aprojects/peas/web/exemp3/data/ex3.Rdata # this file is c:/GR/aprojects/peas/web/exemp3/program_code/ex3.R # ############################################## # # data are in a data frame health # to get the names of the variables # names(health) # # the cigarette smoking data is in a factor called health$CIGST1 # to get a table of this variable do # table(health$CIGST1) # # the next bit of code makes a new 1/0 variable for smokers # and stores it in the data frame (health) # health$SMOKER<-(health$CIGST1=='Current cigarette smoker') health$SMOKER[health$CIGST1=='Dont know']<-NA health$SMOKER[health$CIGST1=='Refused/Not answered']<-NA health$SMOKER[health$CIGST1=='schedule not obtained']<-NA # # to check that it has worked correctly # you will see that we have set the result to the R missing value (NA) # when question was not answered # table(health$CIGST1,health$SMOKER,exclude=NULL) # # now it is time to try using the survey package # # If you have not already done so you will need to # download the survey package # (>>pakages menu >> install package from cran>> select a mirror near you # and pick the survey pakage from the list) # then to give R access to the survey functions do # library(survey) # # # The first step is to create a SURVEY DESIGN OBJECT that # will contain both the data and the information about the design # # This design has WEIGHTS (WEIGHTA),  # it is clustered so it has PSUs (PSU) # and it is stratified (implicitly) by deprivation category within region # and the PSUs have been grouped into implicit strata (REGSTRAT) # with just 2 or three PSUs per stratum # There are TWO kinds of survey design # # 1. Using simple methods , usually for surveys WITHOUT POST-STRATIFICATION # 2. Using replicate weights, usually for surveys with POST-STARTIFICATION # # The Scottish Health Survey STRATIFICATION and POST-STRATIFICATION so # to allow for all features we should use functions of the second kind # But first we set up a design to allow for clustering weighting and stratification # that could be analysed by the simple methods # health.des <- svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health) # # see the html help file for the function svydesign for more explanation # the ~ sign refers to it being one of the columns of the data # the R object health.des now contains all the information about the design # including the data # To get details of the design summary(health.des) # # now we can use the svymean command to get the smoking rate and its standard error # svymean(~SMOKER,health.des,deff=T) # # annoyingly this gives a missing result because of the missing values # but we can get over this by using a command to restrict the survey design # to the non-missing cases (!=not is.na identifies missing data) # svymean(~SMOKER,subset(health.des,!is.na(SMOKER))) # this gives a rate of 0.332 (33.2%) and its standard error of 0.006 (0.6%) # we can compare this with the corresponding unweighted mean  # which again has to be restricted to the unweighted cases # mean(health$SMOKER[!is.na(health$SMOKER)]) # # this gives the higher rate of 34.79% # # The code below plots the illustration in the Web page for this exemplar # It is restricted to adults under 65 to make it comparable with 1995 data # To understand it fully you would need to learn more R from one of the help # guides # rawmean<-mean(health$SMOKER[!is.na(health$SMOKER) & health$AGE<65]) wtmean<-print( svymean( ~SMOKER,subset(health.des,!is.na(SMOKER) & AGE<65)) ) par(mfrow=c(1,1)) plot(c(31,40),c(0,1),type='n',xlab='smoking rate (%)',ylab='',yaxt='n') legend(31,1,c('weighted rate 95% C. I.','unweighted rate','1995 weighted rate'), pch=c(15,16,17),col=c(1,2,3),bty='n',cex=c(1.5,1.5,1.5),lty=c(1,0,0)) points(rawmean*100,0.2,pch=16,cex=2,col=2) lines((wtmean[1]+c(-2,2)*wtmean[2])*100,c(0.2,0.2)) points(wtmean[1]*100,0.2,pch=15,cex=2) points(35,.2,pch=17,cex=2,col=3) text(39,.2,'All') # # now by sex # rawmean<-mean(health$SMOKER[!is.na(health$SMOKER) & health$SEX=='male'& health$AGE<65]) wtmean<-print( svymean( ~SMOKER,subset(health.des,(!is.na(SMOKER) & SEX=='male'& AGE<65) ) ) ) points(rawmean*100,0.4,pch=16,cex=2,col=2) lines((wtmean[1]+c(-2,2)*wtmean[2])*100,c(0.4,0.4)) points(wtmean[1]*100,0.4,pch=15,cex=2) points(34,.4,pch=17,cex=2,col=3) text(39,.4,'male') rawmean<-mean(health$SMOKER[!is.na(health$SMOKER) & health$SEX=='female'& health$AGE<65]) wtmean<-print( svymean( ~SMOKER,subset(health.des,(!is.na(SMOKER) & SEX=='female'& AGE<65) ) ) ) points(rawmean*100,0.6,pch=16,cex=2,col=2) lines((wtmean[1]+c(-2,2)*wtmean[2])*100,c(0.6,0.6)) points(wtmean[1]*100,0.6,pch=15,cex=2) points(36,.6,pch=17,cex=2,col=3) text(39,.6,'Female') # # We can explore how different features affect the design effect by  # setting up different designs (see web page for this exemplar) # svymean(~SMOKER,subset(svydesign(id=~IDNO, weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T) svymean(~SMOKER,subset(svydesign(id=~PSU, weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T) svymean(~SMOKER,subset(svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T) # the next stage is to convert it into a design with replicate weights # we use the default options which produces, by default, jacknife samples # the BRR method gives almost identical results # see html help for as.svrepdesign  # health.rep.des.BRR <- as.svrepdesign(health.des,type='BRR',small='merge',large='split') summary(health.rep.des.BRR) svymean(~SMOKER,subset(health.rep.des,!is.na(SMOKER))) health.rep.des <- as.svrepdesign(health.des) summary(health.rep.des) svymean(~SMOKER,subset(health.rep.des,!is.na(SMOKER))) # # an alternative method is to use BRR # health.rep.des.BRR <- as.svrepdesign(health.des,type='BRR',large='merge') summary(health.rep.des.BRR) svymean(~SMOKER,subset(health.rep.des.BRR,!is.na(SMOKER))) # # now we can post-stratify this by region and then by # age and sex # This will not affect the weights, since this is already included here # but it MIGHT improve the precisio of estimates # tab.region <- xtabs(GROSSWT~REGION,data=health) health.rep.des<-postStratify(health.rep.des,~REGION,tab.region) svymean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T) tab.agesex <- xtabs(GROSSWT~SEX+AGEG,data=health) health.rep.des<-postStratify(health.rep.des,~SEX+AGEG,tab.agesex) svymean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T) # # In fact you will see that it does little to improve things # health.rep.des.BRR<-postStratify(health.rep.des.BRR,~REGION,tab.region) svymean(~SMOKER,subset(health.rep.des.BRR,!is.na(SMOKER)),deff=T) health.rep.des.BRR<-postStratify(health.rep.des,~SEX+AGEG,tab.agesex) svymean(~SMOKER,subset(health.rep.des.BRR,!is.na(SMOKER)),deff=T) # # and the same is true for BRR methods # now we look at the precision of estimates for regions and Health Boards #  svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lothian & Fife')) svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lothian')) svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Fife')) summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lothian & Fife'),family='binomial')) #  svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lanarkshire,Ayrshire & Arran')) svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lanarkshire')) svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Ayrshire & Arran')) summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lanarkshire,Ayrshire & Arran'),family='binomial')) # and also for BRR method # precision of estimates for regions and Health Boards # svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lothian & Fife')) svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lothian')) svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Fife')) summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lothian & Fife'),family='binomial')) # svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lanarkshire,Ayrshire & Arran')) svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lanarkshire')) svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Ayrshire & Arran')) summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lanarkshire,Ayrshire & Arran'),family='binomial')) # # now some logistic regression models to explore influences on SMOKING # summary(svyglm(SMOKER~SEX+AGEG+REGION,design=subset(health.des,!is.na(SMOKER)),family='binomial')) summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+SC+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial')) summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+CARSTG5+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial')) # # and now some barplots to illustrate # the importance of different factors # in the models above 6 per page # par(mfrow=c(2,3),oma=c(0,0,0,0)) #  # Social Class # tt<-svyby(~SMOKER,~SC,svymean,design=subset(health.des,!is.na(SMOKER))) barplot(100*tt[,2],names=c('NK',1,2,3.1,3.2,4,5,'NA'),main='by social class',ylab='% cigarette smokers',ylim=c(0,60)) # # area deprivation # tt<-svyby(~SMOKER,~CARSTG5,svymean,design=subset(health.des,!is.na(SMOKER))) barplot(100*tt[,2],names=c(1:5),main='by 5 Carstairs groups',ylab='% cigarette smokers',ylim=c(0,50)) # # number of adults in household- first need to group large numbers together # and this needs to be done by adding a variable to the design object # health.des<-update(health.des,NOFAD2=NOFAD*(NOFAD<=5)+5*(NOFAD>5)) tt<-svyby(~SMOKER,~NOFAD2,svymean,design=subset(health.des,!is.na(SMOKER))) barplot(100*tt[,2],names=c(1:4,'5+'),main='by number of adults',ylab='% cigarette smokers',ylim=c(0,50)) # # regions # tt<-svyby(~SMOKER,~REGION,svymean,design=subset(health.des,!is.na(SMOKER))) barplot(100*tt[,2],names=c('H&I','G&T','L&F','BD&G','Gla','LA&A','FVA'),main='by regions',ylab='% cigarette smokers',ylim=c(0,50)) # # age for men # tt<-svyby(~SMOKER,~AGEG,svymean,design=subset(health.des,!is.na(SMOKER) & SEX=='male')) barplot(100*tt[,2],main='age groups men',ylab='% cigarette smokers',names=17+5*(0:11),ylim=c(0,50)) # # age groups women # tt<-svyby(~SMOKER,~AGEG,svymean,design=subset(health.des,!is.na(SMOKER)& SEX=='female')) barplot(100*tt[,2],main='age groups women',names=17+5*(0:11),ylab='% cigarette smokers',ylim=c(0,50))