##############################################
#
# HOW TO USE THIS FILE
# A text version of this file (ex3.R) is also available which can just be sourced
# into the R program
# This file is intended for you to use interactively by pasting into R
# Comments are in black - red bits are commands that can be pasted into R
#
######### ANY TEXT AFTER
THE # CHARACTER ARE TREATED AS COMMENTS
#
# data stored in
c:/GR/aprojects/peas/web/exemp3/data/ex3.Rdata
# this file is
c:/GR/aprojects/peas/web/exemp3/program_code/ex3_R.html
#
##############################################
#
#
data are in a data frame health
# to get the names of the variables
#
names(health)
[1] "AGE" "SEX" "REGION" "HBOARD" "CARSTAIR" "CARSTG5" [7] "CIGST1" "CIGST2" "ARCHPT" "NOFAD" "SC" "WEIGHTA" [13] "GROSSWT" "IDNO" "PSUNO" "PSU" "MONTH" "QUARTER" [19] "STRATUM" "NIN" "REGSTRAT" "AGEG"
#
# the cigarette smoking data is in a factor called health$CIGST1
# to get a table of this variable do
#
table(health$CIGST1)
Refused/Not answered Dont know
14 16
schedule not obtained not applicable
3 3
Never smoked cigarettes at all Used to smoke cigarettes occasionally
3711 269
Used to smoke cigarettes regularly Current cigarette smoker
1895 3136
# the next bit of code makes a new 1/0 variable for smokers
# and stores it in the
data frame (health)
#
health$SMOKER<- (health$CIGST1=='Current cigarette smoker')
health$SMOKER[health$CIGST1=='Dont know']<-NA
health$SMOKER[health$CIGST1=='Refused/Not answered']<-NA
health$SMOKER[health$CIGST1=='schedule
not obtained']<-NA
#
# to check that it has worked correctly
# you will see that we have set the result to the R missing value (NA)
# when question was not answered
#
table(health$CIGST1,health$SMOKER,exclude=NULL)
FALSE TRUE <NA> Refused/Not answered 0 0 14 Dont know 0 0 16 schedule not obtained 0 0 3 not applicable 3 0 0 Never smoked cigarettes at all 3711 0 0 Used to smoke cigarettes occasionally 269 0 0 Used to smoke cigarettes regularly 1895 0 0 Current cigarette smoker 0 3136 0
#
# now it is time to try using the survey
package
# to give R access to the survey functions do
#
library(survey)
#
#
The first step is to create a SURVEY DESIGN OBJECT that
# will contain both the data and the information about the design
#
# This design has WEIGHTS (WEIGHTA),
# it is clustered so it has PSUs (PSU)
# and it is stratified (implicitly) by deprivation category within region
# and the PSUs have been grouped into implicit strata (REGSTRAT)
#
with just 2 or three PSUs per stratum
#
There are TWO kinds of survey design
#
# 1. Using simple methods , usually for
surveys WITHOUT POST-STRATIFICATION
# 2. Using replicate weights, usually for
surveys with POST-STARTIFICATION
#
# The Scottish Health Survey STRATIFICATION
and POST-STRATIFICATION so
# to allow for all features we should use functions of the second kind
# But first we set up a design to allow for clustering weighting and stratification
# that could be analysed by the simple methods
#
health.des <- svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health)
#
# see the html help file for the function svydesign for more explanation
# the ~ sign refers to it being one of the columns of the data
# the R object health.des now contains all the information about the design
# including the data
# To get details of the design
summary(health.des)
#
Stratified 1 - level Cluster Sampling design
With ( 312 ) clusters.
svydesign(id = ~PSU, strata = ~REGSTRAT, weights = ~WEIGHTA,
data = health)
Probabilities:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.1260 0.7825 1.1090 1.3920 1.7710 15.3900
Stratum sizes:
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 201 202
obs 48 45 58 65 52 71 68 65 49 54 69 54 59 56 89 57 57
design.PSU 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2
actual.PSU 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2
lines omitted
719 obs 51 design.PSU 2 actual.PSU 2 Data variables: [1] "AGE" "SEX" "REGION" "HBOARD" "CARSTAIR" "CARSTG5" [7] "CIGST1" "CIGST2" "ARCHPT" "NOFAD" "SC" "WEIGHTA" [13] "GROSSWT" "IDNO" "PSUNO" "PSU" "MONTH" "QUARTER" [19] "STRATUM" "NIN" "REGSTRAT" "AGEG" "SMOKER"
# now we can use the svymean command to get the smoking rate and its standard error
#
svymean(~SMOKER,health.des,deff=T)
mean SE DEff SMOKER NA NA NA
#
# annoyingly this gives a missing result because of the missing values
# but we can get over this by using a command to restrict the survey design
# to the non-missing cases (!=not is.na identifies missing data)
#
svymean(~SMOKER,subset(health.des,!is.na(SMOKER)))
mean SE SMOKER 0.33312 0.006
# this gives a rate of 0.332 (33.2%) and its standard error of 0.006 (0.6%)
# we can compare this with the corresponding unweighted mean
# which again has to be restricted to the unweighted cases
#
mean(health$SMOKER[!is.na(health$SMOKER)])
[1] 0.3479033
#
# this gives the higher rate of 34.79%
#
#
# We can explore how different features affect the design effect by
# setting up different designs (see web page for this exemplar)
#
svymean(~SMOKER,subset(svydesign(id=~IDNO, weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)
mean SE DEff SMOKER 0.3331194 0.0057008 1.3187
svymean(~SMOKER,subset(svydesign(id=~PSU,
weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)
mean SE DEff SMOKER 0.3331194 0.0074961 2.28
svymean(~SMOKER,subset(svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)
mean SE DEff SMOKER 0.3331194 0.0060102 1.4657
# the next stage is to convert it into a design with replicate weights
# we use the default options which produces, by default, jacknife samples
# the BRR method gives almost identical results
# see html help for as.svrepdesign
#
health.rep.des <- as.svrepdesign(health.des)
svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)))
mean SE SMOKER 0.33312 0.006
#
# now we can post-stratify this by region and then by
# age and sex
# This will not affect the weights, since this is already included here
# but it MIGHT improve the precisio of estimates
#
tab.region <- xtabs(GROSSWT~REGION,data=health)
health.rep.des<-postStratify(health.rep.des,~REGION,tab.region)
svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T)
mean SE DEff SMOKER 0.3331194 0.0060317 1.4762
tab.agesex <- xtabs(GROSSWT~SEX+AGEG,data=health)
health.rep.des<-postStratify(health.rep.des,~SEX+AGEG,tab.agesex)
svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T)
mean SE DEff SMOKER 0.3331194 0.0059543 1.4386
#
# In fact you will see that it does little to improve things
#
# now we look at the precision of estimates for regions and Health Boards
#
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lothian & Fife'))
mean SE SMOKER 0.32138 0.0114
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lothian'))
mean SE SMOKER 0.30387 0.0152
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Fife'))
mean SE SMOKER 0.35144 0.018
summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lothian & Fife'),family='binomial'))
Call:
svyglm(formula = SMOKER ~ HBOARD, design = subset(health.des,
REGION == "Lothian & Fife"), family = "binomial")
Survey design: subset.survey.design(health.des, REGION == "Lothian & Fife")
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.61273 0.07894 -7.762 1.38e-14 ***
HBOARDLothian -0.21622 0.10769 -2.008 0.0448 *
---
Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
(Dispersion parameter for binomial family taken to be 0.9959916)
Number of Fisher Scoring iterations: 4
#
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lanarkshire,Ayrshire & Arran'))
mean SE SMOKER 0.34259 0.0131
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lanarkshire'))
mean SE SMOKER 0.34435 0.0187
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Ayrshire & Arran'))
mean SE SMOKER 0.34052 0.0249
summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lanarkshire,Ayrshire & Arran'),family='binomial'))
Call:
svyglm(formula = SMOKER ~ HBOARD, design = subset(health.des,
REGION == "Lanarkshire,Ayrshire & Arran"), family = "binomial")
Survey design:
subset.survey.design(health.des, REGION == "Lanarkshire,Ayrshire & Arran")
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.66097 0.11088 -5.961 3.08e-09 ***
HBOARDLanarkshire 0.01702 0.15521 0.110 0.913
---
Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
(Dispersion parameter for binomial family taken to be 0.990106)
Number of Fisher Scoring iterations: 3
#
# now some logistic regression models to explore influences on SMOKING
#
summary(svyglm(SMOKER~SEX+AGEG+REGION,design=subset(health.des,!is.na(SMOKER)),family='binomial'))
Call:
svyglm(formula = SMOKER ~ SEX + AGEG + REGION, design = subset(health.des,
!is.na(SMOKER)), family = "binomial")
Survey design:
subset.survey.design(health.des, !is.na(SMOKER))
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.92027 0.16836 -5.466 4.73e-08 ***
SEXfemale -0.06587 0.05076 -1.298 0.194410
AGEG25-29 0.61030 0.17973 3.396 0.000688 ***
AGEG35-39 0.36622 0.14929 2.453 0.014179 *
AGEG45-49 0.49244 0.15146 3.251 0.001153 **
AGEG55-59 0.25062 0.14305 1.752 0.079799 .
AGEG65-69 0.32661 0.15307 2.134 0.032887 *
AGEG70-74 0.25268 0.16417 1.539 0.123806
AGEG60-64 0.29999 0.14616 2.052 0.040152 *
AGEG50-54 0.29326 0.15695 1.868 0.061731 .
AGEG40-44 0.03624 0.15342 0.236 0.813274
AGEG30-34 -0.19737 0.16570 -1.191 0.233641
AGEG20-24 -0.39945 0.15669 -2.549 0.010812 *
REGIONGrampian & Tayside -0.01852 0.13501 -0.137 0.890867
REGIONLothian & Fife -0.05794 0.12436 -0.466 0.641314
REGIONBorders, Dumfries & Galloway -0.16581 0.15708 -1.056 0.291182
REGIONGlagow 0.14473 0.13502 1.072 0.283761
REGIONLanarkshire,Ayrshire & Arran 0.05118 0.12561 0.407 0.683698
REGIONForth Valley, Argyll & Clyde -0.01661 0.13307 -0.125 0.900662
---
Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
(Dispersion parameter for binomial family taken to be 0.9999929)
Number of Fisher Scoring iterations: 4
summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+SC+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial'))
Call:
svyglm(formula = SMOKER ~ SEX + AGEG + REGION + SEX * AGEG +
SC + NOFAD, design = subset(health.des, !is.na(SMOKER)),
family = "binomial")
Survey design:
subset.survey.design(health.des, !is.na(SMOKER))
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.68445 0.27064 -2.529 0.0115 *
SEXfemale 0.26976 0.27989 0.964 0.3352
AGEG25-29 0.67066 0.29497 2.274 0.0230 *
AGEG35-39 -0.21431 0.22495 -0.953 0.3408
AGEG45-49 0.04875 0.24178 0.202 0.8402
AGEG55-59 -0.22199 0.23247 -0.955 0.3396
AGEG65-69 -0.10241 0.24130 -0.424 0.6713
AGEG70-74 -0.15697 0.24767 -0.634 0.5262
AGEG60-64 -0.22489 0.27364 -0.822 0.4112
AGEG50-54 -0.47543 0.24051 -1.977 0.0481 *
AGEG40-44 -0.40650 0.26823 -1.515 0.1297
AGEG30-34 -1.01923 0.25325 -4.025 5.75e-05 ***
AGEG20-24 -1.27419 0.24841 -5.129 2.97e-07 ***
REGIONGrampian & Tayside 0.02349 0.14229 0.165 0.8689
REGIONLothian & Fife -0.03669 0.13112 -0.280 0.7796
REGIONBorders, Dumfries & Galloway -0.19086 0.16273 -1.173 0.2409
REGIONGlagow 0.21274 0.13806 1.541 0.1234
REGIONLanarkshire,Ayrshire & Arran 0.07497 0.12886 0.582 0.5607
REGIONForth Valley, Argyll & Clyde -0.01536 0.13601 -0.113 0.9101
SC 0.29678 0.02380 12.471 < 2e-16 ***
NOFAD -0.27962 0.05004 -5.588 2.37e-08 ***
SEXfemale:AGEG25-29 -0.88269 0.38027 -2.321 0.0203 *
SEXfemale:AGEG35-39 -0.26478 0.32817 -0.807 0.4198
SEXfemale:AGEG45-49 -0.67796 0.30189 -2.246 0.0247 *
SEXfemale:AGEG55-59 -0.48718 0.30300 -1.608 0.1079
SEXfemale:AGEG65-69 -0.34475 0.31941 -1.079 0.2805
SEXfemale:AGEG70-74 -0.40652 0.32935 -1.234 0.2171
SEXfemale:AGEG60-64 -0.32320 0.36035 -0.897 0.3698
SEXfemale:AGEG50-54 -0.01855 0.33280 -0.056 0.9555
SEXfemale:AGEG40-44 -0.76932 0.34928 -2.203 0.0277 *
SEXfemale:AGEG30-34 -0.10401 0.33665 -0.309 0.7574
SEXfemale:AGEG20-24 -0.14086 0.34942 -0.403 0.6869
---
Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+CARSTG5+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial'))
Call:
svyglm(formula = SMOKER ~ SEX + AGEG + REGION + SEX * AGEG +
CARSTG5 + NOFAD, design = subset(health.des, !is.na(SMOKER)),
family = "binomial")
Survey design:
subset.survey.design(health.des, !is.na(SMOKER))
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.84591 0.25880 -3.269 0.001085 **
SEXfemale 0.24681 0.27504 0.897 0.369558
AGEG25-29 0.92319 0.28223 3.271 0.001075 **
AGEG35-39 0.17917 0.21854 0.820 0.412336
AGEG45-49 0.47653 0.23109 2.062 0.039225 *
AGEG55-59 0.18258 0.22250 0.821 0.411899
AGEG65-69 0.26896 0.22884 1.175 0.239895
AGEG70-74 0.26452 0.23738 1.114 0.265161
AGEG60-64 0.18358 0.25527 0.719 0.472065
AGEG50-54 -0.04392 0.23055 -0.191 0.848918
AGEG40-44 0.03001 0.25316 0.119 0.905640
AGEG30-34 -0.62017 0.23695 -2.617 0.008879 **
AGEG20-24 -0.84079 0.24086 -3.491 0.000484 ***
REGIONGrampian & Tayside 0.01935 0.14091 0.137 0.890772
REGIONLothian & Fife -0.13188 0.12894 -1.023 0.306433
REGIONBorders, Dumfries & Galloway -0.15780 0.16500 -0.956 0.338918
REGIONGlagow -0.13488 0.14070 -0.959 0.337769
REGIONLanarkshire,Ayrshire & Arran -0.14715 0.13507 -1.089 0.275982
REGIONForth Valley, Argyll & Clyde -0.17572 0.13850 -1.269 0.204573
CARSTG5 0.23819 0.02199 10.829 < 2e-16 ***
NOFAD -0.26871 0.04293 -6.260 4.03e-10 ***
SEXfemale:AGEG25-29 -0.84820 0.37402 -2.268 0.023364 *
SEXfemale:AGEG35-39 -0.28549 0.32528 -0.878 0.380147
SEXfemale:AGEG45-49 -0.63784 0.29589 -2.156 0.031137 *
SEXfemale:AGEG55-59 -0.47193 0.30514 -1.547 0.121992
SEXfemale:AGEG65-69 -0.28833 0.31546 -0.914 0.360747
SEXfemale:AGEG70-74 -0.33610 0.32837 -1.024 0.306081
SEXfemale:AGEG60-64 -0.22750 0.34888 -0.652 0.514368
SEXfemale:AGEG50-54 0.02436 0.33317 0.073 0.941724
SEXfemale:AGEG40-44 -0.72016 0.34619 -2.080 0.037533 *
SEXfemale:AGEG30-34 0.01953 0.32453 0.060 0.952011
SEXfemale:AGEG20-24 -0.07382 0.34487 -0.214 0.830509
---
Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
(Dispersion parameter for binomial family taken to be 1.000655)