##############################################
#
# HOW TO USE THIS FILE
# A text version of this file (ex3.R) is also available which can just be sourced
# into the R program
# This file is intended for you to use interactively by pasting into R
# Comments are in black - red bits are commands that can be pasted into R
#
#
######### ANY TEXT AFTER THE # CHARACTER ARE TREATED AS COMMENTS
#
# data stored in c:/GR/aprojects/peas/web/exemp3/data/ex4.Rdata
# this file is c:/GR/aprojects/peas/web/exemp3/program_code/ex4_R.html
#
##############################################
#
# data are in a data frame health
# to get the names of the variables
#
names(wers)
#
# the ethnic group percentages are in a column labelled ETHNIC
# to get a histogram of this variable do
#
hist(wers$ETHNIC)
#
# now it is time to try using the survey package
# to give R access to the survey functions do - see mini guide if this fails
#
library(survey)
#
# The first step is to create a SURVEY DESIGN OBJECT that
# will contain both the data and the information about the design
#
# This design has WEIGHTS (EST_WT),
# and it is stratified by SIZEBAND and ?????
# Since it is not clustered we set the sampling unit ids
# to be the serial number SERNO, different for each workplace
#
wers.des <- svydesign(id=~SERNO, strata=~STRATA,weights=~EST_WT,data=wers)
#
# we can look at the survey weighted mean of the percentages
# of females, disabled people and those from ethnic minorities
# as follows though in each case we need to select the subset of the design
# without missing values (!is.na means not missing)
#
svymean(~FEMALE,subset(wers.des,!is.na(FEMALE)),deff=T)
svymean(~DISAB,subset(wers.des,!is.na(DISAB)),deff=T)
svymean(~ETHNIC,subset(wers.des,!is.na(ETHNIC)),deff=T)
#
# in order to compare means for groups in R for employers
# with equal opps policies and those without
# we need to run a linear model with the glm command
# or its survey equivalent svyglm
#
summary(svyglm(FEMALE~EO,wers.des))
summary(glm(FEMALE~EO,data=wers))
#
# both analyses show many much higher rates of female employees when
# workplace has an equal opportunities policy
#
# But we find a very large difference between the weighted and unweighted differences
# in % female 11% unweighted versus 18% weighted
# this is confirmed by computing the subgroup means with the commands below
# that also annoyingly need lots of code to sort out missing values
#
mean(wers$FEMALE[wers$EO==0 & !is.na(wers$EO)& !is.na(wers$FEMALE) ])
mean(wers$FEMALE[wers$EO==1 & !is.na(wers$EO)& !is.na(wers$FEMALE) ])
weighted.mean(wers$FEMALE[wers$EO==0 & !is.na(wers$EO)& !is.na(wers$FEMALE) ],wers$EST_WT[wers$EO==0 & !is.na(wers$EO)& !is.na(wers$FEMALE) ])
weighted.mean(wers$FEMALE[wers$EO==1 & !is.na(wers$EO)& !is.na(wers$FEMALE) ],wers$EST_WT[wers$EO==1 & !is.na(wers$EO)& !is.na(wers$FEMALE) ])
#
# now looking at disabled
#
summary(svyglm(DISAB~EO,wers.des))
summary(glm(DISAB~EO,data=wers))
#
# and ethnic groups
# Very small differences here, low rates overall and no evidence of differences
#
summary(svyglm(ETHNIC~EO,wers.des))
summary(glm(ETHNIC~EO,data=wers))
#
# like females a bigger effect in the adjusted analysis
# but note larger standard error in the survey adjusted analysis
# but clea evidence of more ethnic minority employees when policy in place
#
# Now looking at the question the other way round
# what predictes an equal opportiunities policy
#
# R does not do stepwise methods (some of us think this is good)
# so we need to decide our own strategy starting with putting everything in
#
summary(svyglm(EO~ETHNIC+FEMALE+DISAB+NEMPSIZE,wers.des,family='binomial'))
#
# clearly size of establishment is the biggest predictor
# closely followed by percent female
# ethnic minorities also contributing a bit
# but disabled employees having no effect
#
# next step drops the disabled variable and looks at female and
# size in groups to check linearity
#
wers.des<-update(wers.des,GPSIZE=factor(NEMPSIZE))
wers.des<-update( wers.des,GPFEMALE=factor(ceiling(FEMALE/20)*(FEMALE>0)+(1*FEMALE==0),
labels=c('0-<20','20-<40','40-<60','60-<80','80-100') ) )
summary(svyglm(EO~ETHNIC+GPFEMALE+GPSIZE,wers.des,family='binomial'))
#
# we can see that linearlity looks not to bad for female, but there
# is a tep function up at groups 7 and 8 for size of establishment
# this suggests a model with an extra step function for size
# of workplace
#
wers.des<-update( wers.des,BIG=factor(NEMPSIZE>6,labels=c('size 6 or under','sizes 7 and 8') ))
#
#
summary(svyglm(EO~ETHNIC+FEMALE+NEMPSIZE+BIG,wers.des,family='binomial'))
#
# this shows the expected effects with the large
# size having the biggest effect
# It is now worth checking for any interaction between BIG and FEMALE
#
summary(svyglm(EO~ETHNIC+FEMALE+NEMPSIZE+BIG*FEMALE*BIG,wers.des,family='binomial'))
#
# we can see that the effect of % of female employees is less marked in the bigger workplaces
# note also that the efect of ethnic group still stands out clearly in these
# analyses with the odds of EO increasing by around 1.8 (exp(0.3*20)) when the
# percentage ethnic goes from 0 to 20%.
#
# This final bit of code makes a plot of %EO by FEMALE and SIZE to understand findings.
#
eo<-wers.des$variables$EO
gpfemale<-wers.des$variables$GPFEMALE[!is.na(eo)]
big<-wers.des$variables$BIG[!is.na(eo)]
eo<-eo[!is.na(eo)]
fits<-tapply(eo,list(big,gpfemale),mean)*100
barplot(fits,beside=T,main='%EO policy by %female and size of workplace',
sub='%female groups red=small workplace yellow= large workplace')
lines(c(0,100),c(100,100))
lines(c(0,100),c(0,0))