Exemplar 4 R code

###############################################
# HOW TO USE THIS FILE
# This is an HTML version of the R script file ex4.R
# It is intended to show you the code and to allow links.
# NOT to use interactively to run the code
# For that download the script file ex4.R
# to view the ouptut from these commands in html go to R results page

red stuff (R commands)

#

#
Links in this page
Equal opportunities policies weighted and unweighted - Table 4.3
E O policy by workplace size Table 4.4
Other factors by equal opportunities policy Table 4.5
Logistic regression Table 4.6
Logistic regression Table 4.7
Finite population corrections Table 4.8


# to get names of variables
 names(wers)
#
# the ethnic group percentages are in a column labelled ETHNIC
# to get a histogram of this variable do
#
 hist(wers$ETHNIC)
#
#  now it is time to try using the survey package
#  to give R access to the survey functions do - see mini guide if this fails
#
 library(survey)

Back to top

## ################ table 4.3 ###############################
#
#
# The first step is to create a SURVEY DESIGN OBJECT that 
# will contain both the data and the information about the design
#
# This design has WEIGHTS (EST_WT),but to get the correct
# design effects without using the option deff="replace"
# we need to use the weights that add to the population (GROSSWT) 
#  and it is stratified  by size of workplace
# and sector. There are 71 strata indexed by STRATA
#  Since it is not clustered we set the sampling unit ids
#  to be the serial number SERNO, different for each workplace
 wers.des <- svydesign(id=~SERNO, strata=~STRATA,weights=~GROSSWT,data=wers)
# 
#
# now we can look at the survey weighted mean of the proportions 
# of workplaces with an EO policy using the svymean command
# giving the design as the second argument to the function
#
svymean(~EO,wers.des)
#
# sadly this just gives a missing value since EO has 7 missing values
# but the answer is to remove the missing data from the survey design
# see the help for is.na and for subset.surveydesign 
#
svymean(~EO,subset(wers.des,!is.na(EO)),deff=T)
#
# we can compare this with the unweighted mean had the design been different
#
wers.des <- svydesign(id=~SERNO, strata=~STRATA,data=wers)
svymean(~EO,subset(wers.des,!is.na(EO)),deff=T)
#
Back to top
###################Table 4.4##################################
#
# to get the weighted means for subgroups by employment size
# we can use the svyby command to apply svymean to subgroups
# first resetting the design
#
wers.des <- svydesign(id=~SERNO, strata=~STRATA,weight=GROSSWT,data=wers)
svyby(~EO,~NEMPSIZE,subset(wers.des,!is.na(EO)),svymean,keep.var=TRUE)
#
# the warning messages refer to lonely PSUs (see theory section on checking data)
# This means that the standard errors masy be slightly underestimated
#
Back to top
#######################Table 4.5########################################
#
# Now we get means of percent female by EO
#
svyby(~FEMALE,~EO,subset(wers.des,!is.na(FEMALE)),svymean)
#
# again the easisest way to get the standard error of
# the difference is with a with svyglm (see help)
#
summary(svyglm(FEMALE~EO,wers.des) )
#
# obviously workplaces with an EO policy have a higher % women
# we can now do similar analyses for % ethnic minority employees
#
 svyby(~ETHNIC,~EO,subset(wers.des,!is.na(ETHNIC)),svymean)
 summary(svyglm(ETHNIC~EO,wers.des))
#
# like females a bigger effect in the weighted analysis 
# but clear evidence of more ethnic minority employees when policy in place
#
# now find weighted percent in each  categories of disabled worker %s
# since DISABGRP is a factor R gives proportions
#
 svymean(~DISABGRP,subset(wers.des,!is.na(DISABGRP)))
#
# now means of EO by disabled groups
#
svyby(~EO,~DISABGRP,subset(wers.des,!is.na(DISABGRP) & !is.na(EO)),svymean)
#
# svyglm compares percent in each disabled group category 
# between eo and non-eo workplaces
#
 summary(svyglm((DISABGRP=='none')~EO,subset(wers.des,!is.na(wers$DISABGRP) & !is.na(EO))))
 summary(svyglm((DISABGRP=='under 3%')~EO,wers.des[!is.na(wers$DISABGRP),]))
 summary(svyglm((DISABGRP=='3%+')~EO,wers.des[!is.na(wers$DISABGRP),]))
##############################################################
# now the same things for unweighted data 
 wers.des <- svydesign(id=~SERNO, strata=~STRATA,data=wers)
#
svyby(~FEMALE,~EO,subset(wers.des,!is.na(FEMALE)),svymean)
summary(svyglm(FEMALE~EO,wers.des) )
svyby(~ETHNIC,~EO,subset(wers.des,!is.na(ETHNIC)),svymean)
summary(svyglm(ETHNIC~EO,wers.des))
svymean(~DISABGRP,subset(wers.des,!is.na(DISABGRP)))
svyby(~EO,~DISABGRP,subset(wers.des,!is.na(DISABGRP) & !is.na(EO)),svymean)
 summary(svyglm((DISABGRP=='none')~EO,subset(wers.des,!is.na(wers$DISABGRP) & !is.na(EO))))
 summary(svyglm((DISABGRP=='under 3%')~EO,wers.des[!is.na(wers$DISABGRP),]))
 summary(svyglm((DISABGRP=='3%+')~EO,wers.des[!is.na(wers$DISABGRP),]))
Back to top
##########################################################
# Table 4.6 
# now the svyglm command and the glm commands to
# fit logistic regressions
#  first restore design
 wers.des <- svydesign(id=~SERNO, strata=~STRATA,weights=~GROSSWT,data=wers)
 summary(svyglm(EO~ETHNIC+FEMALE+DISABGRP,wers.des,family='binomial'))
#
# and unweighted
#
 wers.des <- svydesign(id=~SERNO, strata=~STRATA,data=wers)
 summary(svyglm(EO~ETHNIC+FEMALE+DISABGRP,wers.des,family='binomial'))
Back to top 
##################################################################
# table 4.7
# now explore models that include workplace size  
 wers.des <- svydesign(id=~SERNO, strata=~STRATA,weights=~GROSSWT,data=wers)
 summary(svyglm(EO~ETHNIC+FEMALE+DISABGRP+NEMPSIZE,wers.des,family='binomial'))
#
# and without weighting
#
 wers.des <- svydesign(id=~SERNO, strata=~STRATA,data=wers)
 summary(svyglm(EO~ETHNIC+FEMALE+DISABGRP+NEMPSIZE,wers.des,family='binomial'))
Back to top
###############################################################
#
#  table 4.8 with finite population corrections
#
wers.des <- svydesign(id=~SERNO,fpc=~SAMPFRAC, strata=~STRATA,weight=~GROSSWT,data=wers)
svymean(~EO,subset(wers.des,!is.na(EO)))
svyby(~EO,~NEMPSIZE,subset(wers.des,!is.na(EO)),svymean,keep.var=TRUE)
#
# and without
wers.des <- svydesign(id=~SERNO, strata=~STRATA,weight=~GROSSWT,data=wers)
svymean(~EO,subset(wers.des,!is.na(EO)))
svyby(~EO,~NEMPSIZE,subset(wers.des,!is.na(EO)),svymean,keep.var=TRUE)
#
#############################################################
# now some more modelling stuff to make plots
#
 wers.des <- svydesign(id=~SERNO, strata=~STRATA,weights=~GROSSWT,data=wers)
# disabled now dropped out
#
 summary(svyglm(EO~ETHNIC+FEMALE+NEMPSIZE+ETHNIC,wers.des,family='binomial'))

# now  make a factor to test linearity of percent female
#  
wers.des<-update( wers.des,GPFEMALE=factor(ceiling(FEMALE/20)*(FEMALE>0)+(1*FEMALE==0),
              labels=c('0-<20','20-<40','40-<60','60-<80','80-100') ) )
 summary(svyglm(EO~ETHNIC+GPFEMALE+NEMPSIZE,wers.des,family='binomial'))
#
# Linearity for grouped size looks not too bad
# But there seems to be steep increase for workplaces with >50% 
# female. For robustness, stick with this model where we fit both of these as categories
#
# fit as simple (not survey) glm
#
#
# fit as simple (not survey) glm
#
summary(glm(EO~ETHNIC+GPFEMALE+NEMPSIZE,wers,family='binomial'))
#
# we can see that the effect of % of female employees is less marked in the bigger workplaces
#  note also that the efect of ethnic group still stands out  clearly in these
# analyses with the odds of EO increasing by around 1.8 (exp(0.3*20)) when the 
# percentage ethnic goes from 0 to 20%.
#
# This final bit of code makes a plot of %EO by FEMALE and SIZE to understand findings.
#

wers$GPFEMALE=factor(ceiling(wers$FEMALE/20)*(wers$FEMALE>0)+(1*wers$FEMALE==0),
              labels=c('0-<20','20-<40','40-<60','60-<80','80-100') ) 
forpred<-data.frame(NEMPSIZE=rep(0:5,5),ETHNIC=rep(2,30),
              FEMALE=rep(c(10,30,50,70,90),rep(6,5)))
forpred$GPFEMALE=factor(ceiling(forpred$FEMALE/20)*(forpred$FEMALE>0)+(1*forpred$FEMALE==0),
              labels=c('0-<20','20-<40','40-<60','60-<80','80-100') ) 
forpred$NEMPSIZE<-factor(forpred$NEMPSIZE,labels=c('10-24','25-49','50-99','100-199',
'200-499','500+'))


wt.fit<-predict(svyglm(EO~ETHNIC+GPFEMALE+NEMPSIZE,wers.des,family='binomial'),forpred)
unwt.fit<-predict(glm(EO~ETHNIC+GPFEMALE+NEMPSIZE,wers,family='binomial'),forpred)

invlogit<-function(x){1/(1+exp(-x))}
par(mar=c(5,5,1,1),cex.axis=.9,cex.lab=.9,oma=c(0,0,0,0))
plot(c(0,5),c(40,100),type='n',ylab='% with eo policy',xlab='workplace size group')
for (i in 1:5) points(0:5,100*invlogit(wt.fit)[1:6+(i-1)*6],pch=1,lty=1,type='b',col=i,lwd=2)
for (i in 1:5) points(0:5,100*invlogit(unwt.fit)[1:6+(i-1)*6],pch=2,lty=2,type='b',col=i,lwd=2)

legend(3.3,67,c('10% female' ,'30% female',' 50% female', '70% female',' 90% female'),col=1:5,pch=rep(1,5),lty=rep(1,5),bty='n',cex=.8)
legend(4.5,67,c('' ,'',' ', '',''),col=1:5,pch=rep(2,5),lty=rep(2,5),bty='n',cex=.8)
text(3.5,69,'weighted',cex=.8)
text(4.7,69,'unweighted',cex=.8)
#################################################################