##############################################

# HOW TO USE THIS FILE

# A text version of this file (ex3.R) is also available which can just be sourced

# into the R program

# This file is intended for you to use interactively by pasting into R

# Comments are in black - red bits are commands that can be pasted into R

######### ANY TEXT AFTER THE # CHARACTER ARE TREATED AS COMMENTS

# data stored in c:/GR/aprojects/peas/web/exemp3/data/ex3.Rdata

# this file is c:/GR/aprojects/peas/web/exemp3/program_code/ex3_R.html

##############################################

# data are in a data frame health

# to get the names of the variables

names(health)

 [1] "AGE"      "SEX"      "REGION"   "HBOARD"   "CARSTAIR" "CARSTG5"
 [7] "CIGST1"   "CIGST2"   "ARCHPT"   "NOFAD"    "SC"       "WEIGHTA"
[13] "GROSSWT"  "IDNO"     "PSUNO"    "PSU"      "MONTH"    "QUARTER"
[19] "STRATUM"  "NIN"      "REGSTRAT" "AGEG"

# the cigarette smoking data is in a factor called health$CIGST1

# to get a table of this variable do

table(health$CIGST1)

                 Refused/Not answered                             Dont know
                                   14                                    16
                schedule not obtained                        not applicable
                                    3                                     3
       Never smoked cigarettes at all Used to smoke cigarettes occasionally
                                 3711                                   269
   Used to smoke cigarettes regularly              Current cigarette smoker
                                 1895                                  3136

# the next bit of code makes a new 1/0 variable for smokers

# and stores it in the data frame (health)

health$SMOKER<- (health$CIGST1=='Current cigarette smoker')

health$SMOKER[health$CIGST1=='Dont know']<-NA

health$SMOKER[health$CIGST1=='Refused/Not answered']<-NA

health$SMOKER[health$CIGST1=='schedule not obtained']<-NA
#

# to check that it has worked correctly

# you will see that we have set the result to the R missing value (NA)

# when question was not answered

table(health$CIGST1,health$SMOKER,exclude=NULL)

                                        FALSE TRUE <NA>
  Refused/Not answered                     0     0   14
  Dont know                                0     0   16
  schedule not obtained                    0     0    3
  not applicable                           3     0    0
  Never smoked cigarettes at all        3711     0    0
  Used to smoke cigarettes occasionally  269     0    0
  Used to smoke cigarettes regularly    1895     0    0
  Current cigarette smoker                 0  3136    0

# now it is time to try using the survey package

# to give R access to the survey functions do

library(survey)

# The first step is to create a SURVEY DESIGN OBJECT that

# will contain both the data and the information about the design

# This design has WEIGHTS (WEIGHTA),

# it is clustered so it has PSUs (PSU)

# and it is stratified (implicitly) by deprivation category within region

# and the PSUs have been grouped into implicit strata (REGSTRAT)

# with just 2 or three PSUs per stratum

# There are TWO kinds of survey design

# 1. Using simple methods , usually for surveys WITHOUT POST-STRATIFICATION

# 2. Using replicate weights, usually for surveys with POST-STARTIFICATION

# The Scottish Health Survey STRATIFICATION and POST-STRATIFICATION so

# to allow for all features we should use functions of the second kind

# But first we set up a design to allow for clustering weighting and stratification

# that could be analysed by the simple methods

health.des <- svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health)

# see the html help file for the function svydesign for more explanation

# the ~ sign refers to it being one of the columns of the data

# the R object health.des now contains all the information about the design

# including the data

# To get details of the design

summary(health.des)

Stratified 1 - level Cluster Sampling design
With ( 312 ) clusters.
svydesign(id = ~PSU, strata = ~REGSTRAT, weights = ~WEIGHTA,
    data = health)
Probabilities:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
 0.1260  0.7825  1.1090  1.3920  1.7710 15.3900
Stratum sizes:
           101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 201 202
obs         48  45  58  65  52  71  68  65  49  54  69  54  59  56  89  57  57
design.PSU   2   2   2   2   2   2   2   2   2   2   2   2   2   2   3   2   2
actual.PSU   2   2   2   2   2   2   2   2   2   2   2   2   2   2   3   2   2

lines omitted

           719
obs         51
design.PSU   2
actual.PSU   2
Data variables:
 [1] "AGE"      "SEX"      "REGION"   "HBOARD"   "CARSTAIR" "CARSTG5"
 [7] "CIGST1"   "CIGST2"   "ARCHPT"   "NOFAD"    "SC"       "WEIGHTA"
[13] "GROSSWT"  "IDNO"     "PSUNO"    "PSU"      "MONTH"    "QUARTER"
[19] "STRATUM"  "NIN"      "REGSTRAT" "AGEG"     "SMOKER"

# now we can use the svymean command to get the smoking rate and its standard error

svymean(~SMOKER,health.des,deff=T)

       mean SE DEff
SMOKER   NA NA   NA

# annoyingly this gives a missing result because of the missing values

# but we can get over this by using a command to restrict the survey design

# to the non-missing cases (!=not is.na identifies missing data)

svymean(~SMOKER,subset(health.des,!is.na(SMOKER)))

          mean    SE
SMOKER 0.33312 0.006

# this gives a rate of 0.332 (33.2%) and its standard error of 0.006 (0.6%)

# we can compare this with the corresponding unweighted mean

# which again has to be restricted to the unweighted cases

mean(health$SMOKER[!is.na(health$SMOKER)])

[1] 0.3479033

# this gives the higher rate of 34.79%

# We can explore how different features affect the design effect by

# setting up different designs (see web page for this exemplar)

svymean(~SMOKER,subset(svydesign(id=~IDNO, weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)

            mean        SE   DEff
SMOKER 0.3331194 0.0057008 1.3187

svymean(~SMOKER,subset(svydesign(id=~PSU,

weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)

           mean        SE DEff
SMOKER 0.3331194 0.0074961 2.28

svymean(~SMOKER,subset(svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)

            mean        SE   DEff
SMOKER 0.3331194 0.0060102 1.4657

# the next stage is to convert it into a design with replicate weights

# we use the default options which produces, by default, jacknife samples

# the BRR method gives almost identical results

# see html help for as.svrepdesign

health.rep.des <- as.svrepdesign(health.des)

svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)))

          mean    SE
SMOKER 0.33312 0.006

# now we can post-stratify this by region and then by

# age and sex

# This will not affect the weights, since this is already included here

# but it MIGHT improve the precisio of estimates

tab.region <- xtabs(GROSSWT~REGION,data=health)

health.rep.des<-postStratify(health.rep.des,~REGION,tab.region)

svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T)

            mean        SE   DEff
SMOKER 0.3331194 0.0060317 1.4762

tab.agesex <- xtabs(GROSSWT~SEX+AGEG,data=health)

health.rep.des<-postStratify(health.rep.des,~SEX+AGEG,tab.agesex)

svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T)

            mean        SE   DEff
SMOKER 0.3331194 0.0059543 1.4386

# In fact you will see that it does little to improve things

# now we look at the precision of estimates for regions and Health Boards

svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lothian & Fife'))

          mean     SE
SMOKER 0.32138 0.0114

svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lothian'))

          mean     SE
SMOKER 0.30387 0.0152

svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Fife'))

          mean    SE
SMOKER 0.35144 0.018

summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lothian & Fife'),family='binomial'))

Call:
svyglm(formula = SMOKER ~ HBOARD, design = subset(health.des,
    REGION == "Lothian & Fife"), family = "binomial")

Survey design:
subset.survey.design(health.des, REGION == "Lothian & Fife")

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)   -0.61273    0.07894  -7.762 1.38e-14 ***
HBOARDLothian -0.21622    0.10769  -2.008   0.0448 *
---
Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1

(Dispersion parameter for binomial family taken to be 0.9959916)

Number of Fisher Scoring iterations: 4

svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lanarkshire,Ayrshire & Arran'))

          mean     SE
SMOKER 0.34259 0.0131

svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lanarkshire'))

          mean     SE
SMOKER 0.34435 0.0187

svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Ayrshire & Arran'))

          mean     SE
SMOKER 0.34052 0.0249

summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lanarkshire,Ayrshire & Arran'),family='binomial'))

Call:
svyglm(formula = SMOKER ~ HBOARD, design = subset(health.des,
    REGION == "Lanarkshire,Ayrshire & Arran"), family = "binomial")
Survey design:
subset.survey.design(health.des, REGION == "Lanarkshire,Ayrshire & Arran")
Coefficients:
                  Estimate Std. Error t value Pr(>|t|)
(Intercept)       -0.66097    0.11088  -5.961 3.08e-09 ***
HBOARDLanarkshire  0.01702    0.15521   0.110    0.913
---
Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
(Dispersion parameter for binomial family taken to be 0.990106)
Number of Fisher Scoring iterations: 3

# now some logistic regression models to explore influences on SMOKING

summary(svyglm(SMOKER~SEX+AGEG+REGION,design=subset(health.des,!is.na(SMOKER)),family='binomial'))

Call:
svyglm(formula = SMOKER ~ SEX + AGEG + REGION, design = subset(health.des,
    !is.na(SMOKER)), family = "binomial")
Survey design:
subset.survey.design(health.des, !is.na(SMOKER))
Coefficients:
                                   Estimate Std. Error t value Pr(>|t|)
(Intercept)                        -0.92027    0.16836  -5.466 4.73e-08 ***
SEXfemale                          -0.06587    0.05076  -1.298 0.194410
AGEG25-29                           0.61030    0.17973   3.396 0.000688 ***
AGEG35-39                           0.36622    0.14929   2.453 0.014179 *
AGEG45-49                           0.49244    0.15146   3.251 0.001153 **
AGEG55-59                           0.25062    0.14305   1.752 0.079799 .
AGEG65-69                           0.32661    0.15307   2.134 0.032887 *
AGEG70-74                           0.25268    0.16417   1.539 0.123806
AGEG60-64                           0.29999    0.14616   2.052 0.040152 *
AGEG50-54                           0.29326    0.15695   1.868 0.061731 .
AGEG40-44                           0.03624    0.15342   0.236 0.813274
AGEG30-34                          -0.19737    0.16570  -1.191 0.233641
AGEG20-24                          -0.39945    0.15669  -2.549 0.010812 *
REGIONGrampian & Tayside           -0.01852    0.13501  -0.137 0.890867
REGIONLothian & Fife               -0.05794    0.12436  -0.466 0.641314
REGIONBorders, Dumfries & Galloway -0.16581    0.15708  -1.056 0.291182
REGIONGlagow                        0.14473    0.13502   1.072 0.283761
REGIONLanarkshire,Ayrshire & Arran  0.05118    0.12561   0.407 0.683698
REGIONForth Valley, Argyll & Clyde -0.01661    0.13307  -0.125 0.900662
---
Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
(Dispersion parameter for binomial family taken to be 0.9999929)
Number of Fisher Scoring iterations: 4

summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+SC+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial'))

Call:
svyglm(formula = SMOKER ~ SEX + AGEG + REGION + SEX * AGEG +
    SC + NOFAD, design = subset(health.des, !is.na(SMOKER)),
    family = "binomial")
Survey design:
subset.survey.design(health.des, !is.na(SMOKER))
Coefficients:
                                   Estimate Std. Error t value Pr(>|t|)
(Intercept)                        -0.68445    0.27064  -2.529   0.0115 *
SEXfemale                           0.26976    0.27989   0.964   0.3352
AGEG25-29                           0.67066    0.29497   2.274   0.0230 *
AGEG35-39                          -0.21431    0.22495  -0.953   0.3408
AGEG45-49                           0.04875    0.24178   0.202   0.8402
AGEG55-59                          -0.22199    0.23247  -0.955   0.3396
AGEG65-69                          -0.10241    0.24130  -0.424   0.6713
AGEG70-74                          -0.15697    0.24767  -0.634   0.5262
AGEG60-64                          -0.22489    0.27364  -0.822   0.4112
AGEG50-54                          -0.47543    0.24051  -1.977   0.0481 *
AGEG40-44                          -0.40650    0.26823  -1.515   0.1297
AGEG30-34                          -1.01923    0.25325  -4.025 5.75e-05 ***
AGEG20-24                          -1.27419    0.24841  -5.129 2.97e-07 ***
REGIONGrampian & Tayside            0.02349    0.14229   0.165   0.8689
REGIONLothian & Fife               -0.03669    0.13112  -0.280   0.7796
REGIONBorders, Dumfries & Galloway -0.19086    0.16273  -1.173   0.2409
REGIONGlagow                        0.21274    0.13806   1.541   0.1234
REGIONLanarkshire,Ayrshire & Arran  0.07497    0.12886   0.582   0.5607
REGIONForth Valley, Argyll & Clyde -0.01536    0.13601  -0.113   0.9101
SC                                  0.29678    0.02380  12.471  < 2e-16 ***
NOFAD                              -0.27962    0.05004  -5.588 2.37e-08 ***
SEXfemale:AGEG25-29                -0.88269    0.38027  -2.321   0.0203 *
SEXfemale:AGEG35-39                -0.26478    0.32817  -0.807   0.4198
SEXfemale:AGEG45-49                -0.67796    0.30189  -2.246   0.0247 *
SEXfemale:AGEG55-59                -0.48718    0.30300  -1.608   0.1079
SEXfemale:AGEG65-69                -0.34475    0.31941  -1.079   0.2805
SEXfemale:AGEG70-74                -0.40652    0.32935  -1.234   0.2171
SEXfemale:AGEG60-64                -0.32320    0.36035  -0.897   0.3698
SEXfemale:AGEG50-54                -0.01855    0.33280  -0.056   0.9555
SEXfemale:AGEG40-44                -0.76932    0.34928  -2.203   0.0277 *
SEXfemale:AGEG30-34                -0.10401    0.33665  -0.309   0.7574
SEXfemale:AGEG20-24                -0.14086    0.34942  -0.403   0.6869
---
Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1

summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+CARSTG5+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial'))

Call:
svyglm(formula = SMOKER ~ SEX + AGEG + REGION + SEX * AGEG +
    CARSTG5 + NOFAD, design = subset(health.des, !is.na(SMOKER)),
    family = "binomial")
Survey design:
subset.survey.design(health.des, !is.na(SMOKER))
Coefficients:
                                   Estimate Std. Error t value Pr(>|t|)
(Intercept)                        -0.84591    0.25880  -3.269 0.001085 **
SEXfemale                           0.24681    0.27504   0.897 0.369558
AGEG25-29                           0.92319    0.28223   3.271 0.001075 **
AGEG35-39                           0.17917    0.21854   0.820 0.412336
AGEG45-49                           0.47653    0.23109   2.062 0.039225 *
AGEG55-59                           0.18258    0.22250   0.821 0.411899
AGEG65-69                           0.26896    0.22884   1.175 0.239895
AGEG70-74                           0.26452    0.23738   1.114 0.265161
AGEG60-64                           0.18358    0.25527   0.719 0.472065
AGEG50-54                          -0.04392    0.23055  -0.191 0.848918
AGEG40-44                           0.03001    0.25316   0.119 0.905640
AGEG30-34                          -0.62017    0.23695  -2.617 0.008879 **
AGEG20-24                          -0.84079    0.24086  -3.491 0.000484 ***
REGIONGrampian & Tayside            0.01935    0.14091   0.137 0.890772
REGIONLothian & Fife               -0.13188    0.12894  -1.023 0.306433
REGIONBorders, Dumfries & Galloway -0.15780    0.16500  -0.956 0.338918
REGIONGlagow                       -0.13488    0.14070  -0.959 0.337769
REGIONLanarkshire,Ayrshire & Arran -0.14715    0.13507  -1.089 0.275982
REGIONForth Valley, Argyll & Clyde -0.17572    0.13850  -1.269 0.204573
CARSTG5                             0.23819    0.02199  10.829  < 2e-16 ***
NOFAD                              -0.26871    0.04293  -6.260 4.03e-10 ***
SEXfemale:AGEG25-29                -0.84820    0.37402  -2.268 0.023364 *
SEXfemale:AGEG35-39                -0.28549    0.32528  -0.878 0.380147
SEXfemale:AGEG45-49                -0.63784    0.29589  -2.156 0.031137 *
SEXfemale:AGEG55-59                -0.47193    0.30514  -1.547 0.121992
SEXfemale:AGEG65-69                -0.28833    0.31546  -0.914 0.360747
SEXfemale:AGEG70-74                -0.33610    0.32837  -1.024 0.306081
SEXfemale:AGEG60-64                -0.22750    0.34888  -0.652 0.514368
SEXfemale:AGEG50-54                 0.02436    0.33317   0.073 0.941724
SEXfemale:AGEG40-44                -0.72016    0.34619  -2.080 0.037533 *
SEXfemale:AGEG30-34                 0.01953    0.32453   0.060 0.952011
SEXfemale:AGEG20-24                -0.07382    0.34487  -0.214 0.830509
---
Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
(Dispersion parameter for binomial family taken to be 1.000655)