##### Exercise 6.2 in Jong og Heller
NSWDeaths2002 <- read.table(file="http://www.uio.no/studier/emner/matnat/math/STK3100/data/nswdeaths2002_stk3100.csv",header=T,sep=",")
NSWDeaths2002 # Gender(faktor) Age(faktor) Deaths(respons) Population(offsett)
#Plot the data to check if it makes sense to model a linear relationship between the response and the covariate age.
par(mfrow=c(1,3))
#plot deaths
plot(NSWDeaths2002$Age[NSWDeaths2002$Gender == "Male"], NSWDeaths2002$Deaths[NSWDeaths2002$Gender == "Male"], ylim=c(0,9000), ylab="# deaths", xlab="age")
points(NSWDeaths2002$Age[NSWDeaths2002$Gender == "Female"], NSWDeaths2002$Deaths[NSWDeaths2002$Gender == "Female"], col="red")
legend("topleft",legend=c("male","female"),col=c("black","red"),lty=1)
#plot death rate per person
plot(NSWDeaths2002$Age[NSWDeaths2002$Gender == "Male"], (NSWDeaths2002$Deaths/NSWDeaths2002$Population)[NSWDeaths2002$Gender == "Male"], ylab="death rate per person", xlab="age")
points(NSWDeaths2002$Age[NSWDeaths2002$Gender == "Female"], (NSWDeaths2002$Deaths/NSWDeaths2002$Population)[NSWDeaths2002$Gender == "Female"], col="red")
legend("topleft",legend=c("male","female"),col=c("black","red"),lty=1)
#plot log(death rate per person)
plot(NSWDeaths2002$Age[NSWDeaths2002$Gender == "Male"], log(NSWDeaths2002$Deaths/NSWDeaths2002$Population)[NSWDeaths2002$Gender == "Male"], ylim=c(-8,-2), ylab="log death rate", xlab="age")
points(NSWDeaths2002$Age[NSWDeaths2002$Gender == "Female"], log(NSWDeaths2002$Deaths/NSWDeaths2002$Population)[NSWDeaths2002$Gender == "Female"], col="red")
legend("topleft",legend=c("male","female"),col=c("black","red"),lty=1)
# now carry out model selection with respect to AIC = -2l + 2p
# Reasonable Model: Deaths is poiss(my), Population is offsett, link func is log.
# Another reasonable model is binom(ni,pi), link is logit or probit
NSWDeaths2002$numalder=rep(c(20, 30, 40, 50, 60, 70, 80, 90), times=2)
NSWDeaths2002$Alive=NSWDeaths2002$Population-NSWDeaths2002$Deaths
# extensions use other links for them
#################################
#What type of covariates do we need?
#Use AIC as model selection criteria (best model gives lowest AIC):
### age and gender as categorical variables:
fit.0 = glm(Deaths~offset(log(Population)),family=poisson,data=NSWDeaths2002);AIC(fit.0)
fit.g = glm(Deaths~offset(log(Population))+Gender,family=poisson,data=NSWDeaths2002);AIC(fit.g)
fit.a = glm(Deaths~offset(log(Population))+Age,family=poisson,data=NSWDeaths2002);AIC(fit.a)
fit.ag = glm(Deaths~offset(log(Population))+Age+Gender,family=poisson,data=NSWDeaths2002);AIC(fit.ag)
#CONK: both age(as categorical) and gender should be included. lowest AIC = 426.4555
### age as numerical:
NSWDeaths2002$numalder=rep(c(20, 30, 40, 50, 60, 70, 80, 90), times=2) #use the midpoint in each age-group as age variable
fit.na = glm(Deaths~offset(log(Population)) +numalder, family=poisson,data=NSWDeaths2002);AIC(fit.na)
fit.nana2 = glm(Deaths~offset(log(Population)) +numalder + I(numalder^2), family=poisson,data=NSWDeaths2002);AIC(fit.nana2)
fit.nag = glm(Deaths~offset(log(Population)) +numalder + Gender, family=poisson,data=NSWDeaths2002);AIC(fit.nag)
fit.nana2g = glm(Deaths~offset(log(Population)) +numalder + I(numalder^2) +Gender,family=poisson,data=NSWDeaths2002);AIC(fit.nana2g)
#CONK: gender and age as a second degree polynomial. lowest AIC= 493.6268, but this model is not as good as fit.ag !
# let us now carry out some model selection by means of random walk across the space of potential solution
N<-1000 # define number of steps in the walk
maxpow<-4 #define maximum polinomial degree for the covariates to be included
maxnum<-8 #define maximal amount of the covariates in the model (including the polinomials and intersections)
covn<-3 # define total number of non-intersecting original covariates (age, age group, gender)
names(NSWDeaths2002)[[1]]<-"X1"
names(NSWDeaths2002)[[5]]<-"X2"
names(NSWDeaths2002)[[2]]<-"X3"
NSWDeaths2002$X0 <- 1
NSWDeaths2002$X1 <- 1*(NSWDeaths2002$X1 == "Male")
# now carry out random walk search across the models to find the best one in terms of AIC,
# note that simulated annealeng, genetic search, treshold acceptance, tabu search, greedy search with multiple restarts, mcmc
# and other search methods for resolving NP-hard combinatorial optimization problem COP can be used instead!
fit.best<-NULL
aic.best<-Inf
aic.interest<-Inf
for(i in 1:N)
{
p<-round(runif(n = 1,min = 1,max = maxnum)) # generate the number of covariates to be included at the current step
idspow<-NULL
covobs<-"1"
if(runif(n = 1,min = 0,max = 1)>0.5) # randomize if the constant (intercept should be included)
covobs<-"-1"
tt <- tryCatch({
for(i in 1:p)
{
idspow<-cbind(idspow,c(round(runif(n=1,min = 0,max = covn)),round(runif(n = 1,min = 0,max = covn)),round(runif(n = 1,min = 1,max = maxpow)),round(runif(n = 1,min = 1,max = maxpow)))) # generate the covariates, intercections and powers
# note that we should not bother about colinearity of the covariates on our own,
# since glm automatically chooses the independent set if some linear dependence occures
# and fills all the rest with NA
if(idspow[[3,i]]!=1 && idspow[[4,i]]!=1) #define a formula for glm w.r.t. the chosen covariates, intersections and degrees of power
{
covobs<-paste(covobs,"+","I(","X",idspow[[1,i]],")","^",idspow[[3,i]],":I(","X",idspow[[2,i]],")^",idspow[[4,i]],sep = "")
}
else if(idspow[[3,i]]==1 && idspow[[4,i]]!=1)
{
covobs<-paste(covobs,"+","I(","X",idspow[[1,i]],")",":I(","X",idspow[[2,i]],")^",idspow[[4,i]],sep = "")
}
else if(idspow[[3,i]]!=1 && idspow[[4,i]]==1)
{
covobs<-paste(covobs,"+","I(","X",idspow[[1,i]],")","^",idspow[[3,i]],":I(","X",idspow[[2,i]],")",sep = "")
}
else
{
covobs<-paste(covobs,"+","I(","X",idspow[[1,i]],")",":I(","X",idspow[[2,i]],")",sep = "")
}
}
if(runif(n = 1,min = 0,max = 1)>0.5) # randomize if a binomial or poisson regression should be used at a current step
{
formula <- as.formula(paste("cbind(Deaths,Alive)"," ~ ",covobs)) # link covariates and responses within a formula object for binomial responses (successes, failures)
if(runif(n = 1,min = 0,max = 1)>0.5) # randomize if a logit or probit link should get used in the current iteration
{
fit.0 = glm(formula,family=binomial(link = "logit"),data=NSWDeaths2002)
aic.cur<-AIC(fit.0)
} else {
fit.0 = glm(formula,family=binomial(link = "probit"),data=NSWDeaths2002)
aic.cur<-AIC(fit.0)
}
} else {
formula <- as.formula(paste("Deaths"," ~ ","offset(log(Population)) + ",covobs)) # link covariates and responses within a formula object poisson responses with offset
fit.0 = glm(formula,family=poisson(link = "log"),data=NSWDeaths2002)
aic.cur<-AIC(fit.0)
}
},error=function(e) e, warning=function(w) w)
if(aic.best>aic.cur) # update the best found solution
{
print(paste("UPDATE GLOBAL AIC",aic.cur))
fit.best<-fit.0
aic.best<-aic.cur
} else if(aic.best/aic.cur>=0.80 && aic.best!=aic.cur && aic.cur!=aic.interest) # just get the feeling about other inetersing solutions and if they exist
{
print(paste("an interesting solution less that 20% away from the current optimal one ",aic.cur))
aic.interest <- aic.cur
}
}
# print the best found solution
print(aic.best)
summary(fit.best)
# the best I managed to find in total so far is (seems to be the best of what we can do for this dataset in general)
fit.ag.INTERag = glm(Deaths~offset(log(Population))+X3+X1 + I(X3):I(X1),family=poisson,data=NSWDeaths2002)
AIC(fit.ag.INTERag)
# Conk: age(as categorical), gender and Interaction. AIC: 179.1836 (in total the best model)
summary(fit.ag.INTERag)