###############################################
# HOW TO USE THIS FILE
# To submit one line at a time just press Ctrl+R and the line
# where the pointer is will be sent to R
#
# To submit more lines highlight them and press Ctrl+R
#
######### ANY TEXT AFTER THE # CHARACTER ARE TREATED AS COMMENTS
#
# data stored in c:/GR/aprojects/peas/ex1datafiles/data/.Rdata
# this file is c:/GR/aprojects/peas/ex1datafiles/program_code/ex1.R
#
##############################################
#
# data are in a data frame frs
# to get the names of the variables
#
names(frs)
#
# first find the simple mean of the income variable - no weights
#
mean(frs$HHINC)
#
# and then the weighted mean - go to the html help file if you want to check how this works
#
weighted.mean(frs$HHINC,frs$GROSS2)
###########################################################################
# The next line makes the plot shown on the web page and also 
# examines the distribution of the weights
########################################################################
par(omi=c(.1,.1,0,.1))# sets outer margins
boxplot(split(frs$GROSS2,frs$ADULTH),col='red',cex.axis=0.7,cex.lab=0.7,las=2,ylab='Grossing-up weight')
#
# now it is time to try using the survey package
#
# If you have not already done so you will need to 
# download the survey package 
# (>>pakages menu >> install package from cran>> select a mirror near you
#               and pick the survey pakage from the list)
# then to give R access to the survey functions do
#
library(survey)
############################################################
#
# The first step is to create a SURVEY DESIGN OBJECT that 
# will contain both the data and the information about the design
##########################################

frs.des <- svydesign(id=~PSU, weights=~GROSS2,data=frs)
summary(frs.des)
############################################
# see the html help file for the function svydesign for more explanation
# the ~ sign refers to it being one of the columns of the data
# 
# the R object frs.des now contains all the information about the design
# including the data
#
# But first let us examine the effect of weighting and clustering 
# on the precision of the mean weekly income
#
#  First get the standard error and design effect for the mean income
#   allowing for weighting  and clustering
###################################################################

svymean(~HHINC,design=frs.des,deff=T)


#######################################################################
# now  reset the design so that only weighting is allowed
# for. To do this the PSU must be set to the individual case identifier (SERNUM)
###########################################################################

frs.des <- svydesign(id=~SERNUM, weights=~GROSS2,data=frs)
svymean(~HHINC,design=frs.des,deff=T)


##########################################################
# next we  get ready to post-stratify this survey to match the population totals on 
# council tax band (CTBAND) and tenure (TENTYPE)
# read in totals
# by using the R function xtabs 
# to create tables of totals by CTBAND and TENTYPE
###########################################################

tab.ctband <- xtabs(GROSS2~CTBAND,data=frs)
tab.ctband[1:9]<-c( 24.83 24.62 15.45 11.91 11.96 5.95 3.94 0.45 0.89)*sum(frs$GROSS2)/100
unclass(tab.ctband)

tab.tenure <- xtabs(GROSS2~TENURE,data=frs)
tab.tenure[1:4]<-c(  62.63 , 21.59, 5.58 , 10.20)*sum(frs$GROSS2)/100
unclass(tab.ctband)
#
# now we rake the original survey to match these totals
#
frs.des <- svydesign(id=~PSU, weights=~WEIGHTA,data=frs)
frs.des<-rake(frs.des,list(~CTBAND,~TENURE),list(tab.ctband,tab.tenure))
#
#
# 
svymean(~HHINC,design=frs.des,deff=T)
###########################################################
# R uses a method called calibration weighting 
# to get the standard errors for a raked survey
# see the help on raking
# It is not available for any other program
#  featured on this site, but SUDAAN does it
# An alternative is to use replication methods
###################################################################
# There are TWO kinds of survey design
# 
# 1. Using simple methods , as used above
#       usually for surveys WITHOUT POST-STRATIFICATION
# 2. Using replicate weights, usually for surveys with POST-STARTIFICATION
#
# the next stage is to convert the simple design into one with replicate weights
# we use the default options which produce jacknife samples
# see html help for as.svrepdesign
# Go back to the original design and convert it to a 
# replicate one and then post-stratify it
#######################################################
frs.des <- svydesign(id=~PSU, weights=~GROSS2,data=frs)
frs.des <- as.svrepdesign(frs.des)

frs.des<-rake(frs.des,list(~CTBAND,~TENURE),list(tab.ctband,tab.tenure))

svymean(~HHINC,design=frs.des,deff=T)

#####################################################################
# notice that we have overwritten the previous R object with the new one 
# this is to save space as each one contains the data and the work space
# can get too big for comfort
#################################################################

##
#########################################################################################
# NOW GETTING THE PERCENTILES OF THE INCOME DISTRIBUTION
#
my.svrepquantile(~HHINC,design=frs.des,quantile=c(0.05,0.1,.25,.5,.75,.9,.95))
#
# NEED TO USE THIS JUST NOW (source the file myRfunctions.R
# go to R package main page for link to it
# because the survey package one is not working right
# for replicate designs
#
#svyquantile(~HHINC,design=frs.des,quantile=c(0.05,0.1,.25,.5,.75,.9,.95),ci=T)
#

####################################################################################
#
# INCOME FOR LONE PARENTS
#
# the next command will give a table of the data by household type
#
xtabs(~ADULTH+DEPCHLDH,data=frs)
frs$LONEP<-0
frs$LONEP[frs$ADULTH==1 & frs$DEPCHLDH>0]<-1
xtabs(~LONEP+DEPCHLDH,data=frs)
sum(frs$LONEP)
#
# you will see that there are just 334 single parent households
# we want the mean household income and its standard error
# To do this we need to subset the survey (linkto theory)
# In R this means creating a new subset design object
# Need to reset the design so that the new variable LONEP 
# is there
#
frs.des <- svydesign(id=~PSU, weights=~GROSS2,data=frs)
frs.des.lonep<-subset.survey.design(frs.des,LONEP==1)
#
# and then apply all the survey functions to this new design
#
svymean(~HHINC,frs.des.lonep,deff=T)
#
# as expected single parent incomes are lower than the rest of the population
# 
# Now with poststratification
#
frs.des <- svydesign(id=~PSU, weights=~GROSS2,data=frs)
frs.des <- as.svrepdesign(frs.des)
frs.des<-rake(frs.des,list(~CTBAND,~TENURE),list(tab.ctband,tab.tenure))
frs.des.lonep<-subset.survey.design(frs.des,LONEP==1)
svymean(~HHINC,frs.des.lonep,deff=T)