Exemplar 2 R code

names(shs)

#
#
# and to see a histogram of the weights and the range of weights 
# (pretty large) and the ratio of the most extereme weights
#

 hist(shs$IND_WT,col='red')
 range(shs$IND_WT)
 max(shs$IND_WT)/min(shs$IND_WT)

#  to give R access to the survey functions do

library(survey)

# The first step is to create a SURVEY DESIGN OBJECT that 
# will contain both the data and the information about the design
#
# note that in the latest version of R survey 2.9.1 it is necessary
# to use the grossing-up weight to get the correct design effects
# Go to # html help #packages #survey #svymean to see where 
# this is explained 
#
# version 3 will have an option to overcome this
#
##########################################

shs.des <- svydesign(id=~PSU, weights=~GROSSWT,strata=~STRATUM,data=shs)
print(summary(shs.des))

#
# You will have seen a summary of the strata and for each stratum
# the number of PSUs and observations per stratum
# in the LAs where there was no clustering the no. of clusters is
# the same as the number of observations
# NOTE THAT SOME STRATA HAVE BEEN COMBINED TO PREVENT LONELY PSUS
# #############   RESULTS FOR TABLE 2.3  IN EXEMPLAR 2 HOME PAGE##############
#  back to top 
# now use the command svymean to get the mean of
# the 0/1 variable for internet use 
# for the whole sample and for subgroups
#

svymean(~INTUSE,shs.des,deff=T)

#
# by sex
#

svyby(~INTUSE,~SEX,shs.des,svymean,keep.var=T,deff=T)

#
# The error messages are because splitting the data by sex has
# produced a few more lonely PSUS. See the html help for
# survey.lonely.psu by following links from html help menu to #packages #survey 
# There are some options to adjust estimates for this 
# this code will run them
#

options("survey.adjust.domain.lonely"=T)
options("survey.lonely.psu"='adjust')
svyby(~INTUSE,~SEX,shs.des,svymean,keep.var=T,deff=T)

#
# this increases the design effect by only a tiny amount 
# so not worth fussing with here
#
# Now proportions different using for different hours per week
#
# Need to usesubset  to get rid of the cases where this
# is missing for non-internet users
#

svymean(~RC5,subset(shs.des,INTUSE==1),deff=T)

#
####################   THIS CODE FOR TABLE 2.4 xxendlink ################
#  back to top 
#
# testing efect of different designs
#

shs.des <- svydesign(id=~UNIQID, weights=~GROSSWT,data=shs)
svymean(~INTUSE,shs.des,deff=T)
shs.des <- svydesign(id=~PSU, weights=~GROSSWT,data=shs)
svymean(~INTUSE,shs.des,deff=T)
shs.des <- svydesign(id=~UNIQID, weights=~GROSSWT,strata=~STRATUM,data=shs)
svymean(~INTUSE,shs.des,deff=T)
shs.des <- svydesign(id=~PSU, weights=~GROSSWT,strata=~STRATUM,data=shs)
svymean(~INTUSE,shs.des,deff=T)

#
#
###################  now tables xxendlink and chi squared tests for tables 2.5 2.6 2.7################
#  back to top 
#
#  now some chisquare tests
#  the R help (internet help menu #packages #surveys #svychisq)
#   gives details of other options

svychisq(~INTUSE+SEX,shs.des)
chisq.test(table(shs$INTUSE,shs$SEX),correct=F)

# table gives sum of weights by default
#

svytable(~INTUSE+SEX,shs.des)
svytable(~SEX+GROC,shs.des)

#
# table gives sum of weights
# more work would be needed to produce a nice table from this
#

svychisq(~GROC+SEX,shs.des)
chisq.test(table(shs$GROC,shs$SEX),correct=F)
svychisq(~RC5+EMP_STA,subset(shs.des,INTUSE==1))
chisq.test(table(shs$RC5,shs$EMP_STA),correct=T)

#
######################## linkxx by council area xxendlink############
#  back to top 
#

svyby(~INTUSE,~COUNCIL,shs.des,svymean,keep.var=T,deff=T)

#
# now get mean internet use and CI and DEFF for Just Orkney
#

svymean(~INTUSE,subset(shs.des,COUNCIL=='Orkney Islands'),deff=T)

# 
#
# now logistic regression analyses for survey data ren=moving missing categories
#
################# linkxx table 2.7 xxendlink ######################
#  back to top

summary(svyglm(INTUSE~GROUPINC,subset(shs.des,!is.na(GROUPINC)),family='binomial'))

# add rurality

summary(svyglm(INTUSE~GROUPINC+SHS_6CLA,subset(shs.des,!is.na(GROUPINC)),family='binomial'))

################### linkxx figure 1.1 xxlink ##########################################
#  back to top 
#

fit<-svyglm(INTUSE~GROUPINC,design=shs.des)

# now some more complicated code to fit the curves for age and sex # # shown on the first page of this exemplar
# using the splines library in R

library(splines)

# replace one missing value in age
# and calculate terms to fit spline model
# using cubic splines with 3 knots
#

shs$AGE[is.na(shs$AGE)]<-40
splinexs<-bs(shs$AGE,knots=c(25,45,65),degree=3)
SP1<-splinexs[,1]
SP2<-splinexs[,2]
SP3<-splinexs[,3]
SP4<-splinexs[,4]
SP5<-splinexs[,5]
SP6<-splinexs[,6]

# add extra vars to survey design

shs.des<-update(shs.des,SP1=SP1,SP2=SP2,SP3=SP3,SP4=SP4,SP5=SP5,SP6=SP6,AGE=AGE)

# fit spline model
#

fit<-svyglm(INTUSE~SP1+SP2+SP3+SP5+SP6
      +(as.integer(SEX)-1)+ (as.integer(SEX)-1)*SP1+
     (as.integer(SEX)-1)*SP2+(as.integer(SEX)-1)*SP3+
        (as.integer(SEX)-1)*SP4+(as.integer(SEX)-1)*SP5+ 
           (as.integer(SEX)-1)*SP6                                            
            , design=shs.des, family='binomial')

#
# now the age and sex plot shown on Exemplar 1

plot(c(15,83),c(0,65),type='n',xlab='age',ylab='% internet users',)

# fit for men and women calculated from predicted vals

ilogit<-function(x){1/(1+exp(-x))}
Mfit<- ilogit(predict(fit))[shs$SEX=='male'][match(c(16:80),shs$AGE[shs$SEX=='male'])]*100
Wfit<- ilogit(predict(fit))[shs$SEX=='female'][match(c(16:80),shs$AGE[shs$SEX=='female'])]*100

#
# plot rates and add the fitted lines to the plot
#

agemeans<-svyby(~INTUSE,~SEX+AGE,shs.des,svymean)
usebyageM<-agemeans[1:65,3]*100
usebyageW<-agemeans[1:65+65,3]*100
plot(c(15,83),c(0,65),type='n',xlab='age',ylab='% internet users',)
points(c(16:79,82),usebyageM)
points(c(16:79,82),usebyageW,col=2,pch=16)
lines(c(16:79,82),Mfit,lwd=2)
lines(c(16:79,82),Wfit,lwd=2,col=2)

#
##########################################################
# now look at additional effect of other variables
#

summary(update(fit,~.+SHS_6CLA))
summary(update(fit,~.+EMP_STA))

#
# now let us look at the effect of local authority on
# internet use when adjusted for age and sex
#
# unadjusted effect
#

fit1<-svyglm(INTUSE~COUNCIL,design=shs.des)

fit2<-svyglm(INTUSE~COUNCIL+SP1+SP2+SP3+(as.integer(SEX)-1)+(as.integer(SEX)-1)*SP1+(as.integer(SEX)-1)*SP2+(as.integer(SEX)-1)*SP3, design=shs.des, family='binomial')

#
# adjusted age sex income
#

fit3<-svyglm(INTUSE~COUNCIL+SP1+SP2+SP3+(as.integer(SEX)-1)+(as.integer(SEX)-1)*SP1+(as.integer(SEX)-1)*SP2+(as.integer(SEX)-1)*SP3+GROUPINC, design=shs.des, family='binomial')

#
# this next bit compares ranking of adjusted and unadjusted
#  coefficients. It seems that this large variation between local 
# authorities is not explained by age sex or household income
#

comp<-cbind(rank(-c(0,fit1$coefficients[2:32])), 
rank(-fit2$coefficients[1:32]+fit2$coefficients[1]), 
rank(-fit3$coefficients[1:32]+fit3$coefficients[1]))
dimnames(comp)[[1]][1]<-'Aberdeen City'
dimnames(comp)[[2]]<-c('Unadjusted','Age sex','Age sex wealth')
print('Ranking of internet use')
print(comp)