Author: Annika Tillander, 2014-01-30
Edited: Andreas Karlsson, 2015-03-01
Load the diet data using time-on-study as the timescale.
You may have to install the required packages the first time you use them. You can install a package by install.packages("package_of_interest")
for each package you require.
require(foreign) # for reading data set from Stata
require(survival) # for Surv and survfit
require(epiR) # for epi.conf
require(dplyr) # for data manipulation
Load diet data and explore it.
diet <- data.frame(read.dta("http://biostat3.net/download/diet.dta"))
head(diet)
summary(diet)
(a) Tabulate CHD incidence rates per 1000 person-years for each category of hieng. Calculate (by hand) the ratio of the two incidence rates.
diet$y1k <- diet$y/1000
diet.ir6a <- diet %>%
group_by(hieng) %>%
summarise(Event = sum(chd), Time = sum(y1k)) %>% # group sums
cbind(select(., Event, Time) %>% # formating for epi.conf
as.matrix() %>% # formating for epi.conf
epi.conf(ctype="inc.rate", method="exact")) %>% # calc rate + CI
print()
## hieng Event Time est lower upper
## 1 low 28 2.059431 13.595992 9.430643 19.64999
## 2 high 18 2.544238 7.074809 4.496136 11.18125
## IRR
diet.ir6a[diet.ir6a$hieng=="high","est"] / diet.ir6a[diet.ir6a$hieng=="low","est"]
## [1] 0.5203599
(b) Fit a poisson model to find the incidence rate ratio for the high energy group compared to the low energy group and compare the estimate to the one you obtained in the previous question.
poisson6b <- glm( chd ~ hieng + offset( log( y1k ) ), family=poisson, data=diet)
summary(poisson6b)
##
## Call:
## glm(formula = chd ~ hieng + offset(log(y1k)), family = poisson,
## data = diet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7382 -0.6337 -0.4899 -0.3891 3.0161
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.6098 0.1890 13.811 <2e-16 ***
## hienghigh -0.6532 0.3021 -2.162 0.0306 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 262.82 on 336 degrees of freedom
## Residual deviance: 258.00 on 335 degrees of freedom
## AIC: 354
##
## Number of Fisher Scoring iterations: 6
exp(cbind(coef(poisson6b),confint(poisson6b)))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 13.5959916 9.1614552 19.2715805
## hienghigh 0.5203599 0.2829171 0.9328392
(c) Grouping the values of total energy into just two groups does not tell us much about how the CHD rate changes with total energy. It is a useful exploratory device, but to look more closely we need to group the total energy into perhaps 3 or 4 groups.
In this example we shall use the cut points 1500, 2500, 3000, 4500. Compare with a normal distribution to check if these cutpoints seem reasonable.
hist6c <- hist(diet$energy, breaks=25, probability=TRUE)
curve(dnorm(x, mean=mean(diet$energy), sd=sd(diet$energy)), col = "red", add=TRUE)
quantile(diet$energy, probs=c(0.01,0.05,0.1,0.25,0.5,0.75,0.90,0.95,0.99))
## 1% 5% 10% 25% 50% 75% 90% 95%
## 1887.268 2177.276 2314.114 2536.690 2802.980 3109.660 3365.644 3588.178
## 99%
## 4046.820
# For kurtosis and skewness, see package e1071
(d) Create a new variable eng3 coded 1500 for values of energy in the range 1500–2499, 2500 for values in the range 2500–2999, and 3000 for values in the range 3000–4500.
diet$eng3 <- cut(diet$energy, breaks=c(1500,2500,3000,4500),labels=c("low","medium","high"))
diet %>% group_by(eng3) %>%
summarise(Freq = n(), Percent = n()/dim(diet)[1]) %>%
mutate(Cum = cumsum(Percent))
## Source: local data frame [3 x 4]
##
## eng3 Freq Percent Cum
## 1 low 75 0.2225519 0.2225519
## 2 medium 150 0.4451039 0.6676558
## 3 high 112 0.3323442 1.0000000
(e) Estimate the rates for different levels of eng3. Calculate (by hand) the ratio of rates in the second and third levels to the first level.
## IRR
diet.ir6e <- diet %>%
group_by(eng3) %>%
summarise(Event = sum(chd), Time = sum(y1k)) %>% # group sums
cbind(select(., Event, Time) %>% # formating for epi.conf
as.matrix() %>% # formating for epi.conf
epi.conf(ctype="inc.rate", method="exact")) %>% # calc rate + CI
print()
## eng3 Event Time est lower upper
## 1 low 16 0.9466338 16.901995 10.461412 27.447781
## 2 medium 22 2.0172621 10.905871 7.227631 16.511619
## 3 high 8 1.6397728 4.878725 2.509722 9.613033
diet.ir6e[diet.ir6e$eng3=="medium","est"]/diet.ir6e[diet.ir6e$eng3=="low","est"]
## [1] 0.6452416
diet.ir6e[diet.ir6e$eng3=="high","est"]/diet.ir6e[diet.ir6e$eng3=="low","est"]
## [1] 0.2886479
(f) Create your own indicator variables for the three levels of eng3.
dummies6f <- model.matrix(~eng3,data=diet)
diet$X2 <- dummies6f[,"eng3medium"]
diet$X3 <- dummies6f[,"eng3high"]
diet$X1 <- (1-diet$X2)*(1-diet$X3)
colSums(diet[c("X1","X2","X3")])
## X1 X2 X3
## 75 150 112
(g) Check the indicator variables.
head(diet[which(diet$eng3=="low"),c("energy","eng3","X1","X2","X3")])
## energy eng3 X1 X2 X3
## 1 2023.25 low 1 0 0
## 2 2448.68 low 1 0 0
## 3 2281.38 low 1 0 0
## 4 2467.95 low 1 0 0
## 5 2362.93 low 1 0 0
## 6 2380.11 low 1 0 0
head(diet[which(diet$eng3=="medium"),c("energy","eng3","X1","X2","X3")])
## energy eng3 X1 X2 X3
## 76 2664.64 medium 0 1 0
## 77 2533.33 medium 0 1 0
## 78 2854.08 medium 0 1 0
## 79 2673.77 medium 0 1 0
## 80 2766.88 medium 0 1 0
## 81 2586.69 medium 0 1 0
head(diet[which(diet$eng3=="high"),c("energy","eng3","X1","X2","X3")])
## energy eng3 X1 X2 X3
## 226 3067.36 high 0 0 1
## 227 3298.95 high 0 0 1
## 228 3147.60 high 0 0 1
## 229 3180.47 high 0 0 1
## 230 3045.81 high 0 0 1
## 231 3060.03 high 0 0 1
(h) Use poisson to compare the second and third levels with the first.
poisson6h <- glm( chd ~ X2 + X3 + offset( log( y1k ) ), family=poisson, data=diet )
summary( poisson6h)
##
## Call:
## glm(formula = chd ~ X2 + X3 + offset(log(y1k)), family = poisson,
## data = diet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8231 -0.6052 -0.4532 -0.3650 2.9434
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.8274 0.2500 11.312 < 2e-16 ***
## X2 -0.4381 0.3285 -1.334 0.18233
## X3 -1.2425 0.4330 -2.870 0.00411 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 262.82 on 336 degrees of freedom
## Residual deviance: 253.62 on 334 degrees of freedom
## AIC: 351.62
##
## Number of Fisher Scoring iterations: 6
exp(cbind(coef(poisson6h),confint(poisson6h)))
## 2.5 % 97.5 %
## (Intercept) 16.9019960 9.9133301 26.5865487
## X2 0.6452416 0.3407138 1.2492291
## X3 0.2886478 0.1170344 0.6561826
Compare your estimates with those you obtained in part 6e. Write the linear predictor using pen and paper.
(i) Use a poisson model to compare the first and third levels with the second. Write the linear predictor using pen and paper.
poisson6i <- glm( chd ~ X1 + X3 + offset( log( y1k ) ), family=poisson, data=diet )
summary( poisson6i )
##
## Call:
## glm(formula = chd ~ X1 + X3 + offset(log(y1k)), family = poisson,
## data = diet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8231 -0.6052 -0.4532 -0.3650 2.9434
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.3893 0.2132 11.207 <2e-16 ***
## X1 0.4381 0.3285 1.334 0.1823
## X3 -0.8044 0.4129 -1.948 0.0514 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 262.82 on 336 degrees of freedom
## Residual deviance: 253.62 on 334 degrees of freedom
## AIC: 351.62
##
## Number of Fisher Scoring iterations: 6
exp(cbind(coef(poisson6i),confint(poisson6i)))
## 2.5 % 97.5 %
## (Intercept) 10.9058706 6.9598182 16.1181547
## X1 1.5498071 0.8004937 2.9350148
## X3 0.4473485 0.1868711 0.9655319
(j) Repeat the analysis comparing the second and third levels with the first but this time have R create the indicators automatically y using the categorical variable eng3.
poisson6j <- glm( chd ~ eng3 + offset( log( y1k ) ), family=poisson, data=diet )
summary( poisson6j )
##
## Call:
## glm(formula = chd ~ eng3 + offset(log(y1k)), family = poisson,
## data = diet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8231 -0.6052 -0.4532 -0.3650 2.9434
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.8274 0.2500 11.312 < 2e-16 ***
## eng3medium -0.4381 0.3285 -1.334 0.18233
## eng3high -1.2425 0.4330 -2.870 0.00411 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 262.82 on 336 degrees of freedom
## Residual deviance: 253.62 on 334 degrees of freedom
## AIC: 351.62
##
## Number of Fisher Scoring iterations: 6
exp(cbind(coef(poisson6j),confint(poisson6j)))
## 2.5 % 97.5 %
## (Intercept) 16.9019960 9.9133301 26.5865487
## eng3medium 0.6452416 0.3407138 1.2492291
## eng3high 0.2886478 0.1170344 0.6561826
(k) Calculate the total number of events during follow-up, person-time at risk, and the crude incidence rate (per 1000 person-years)
sums6k <- colSums(diet[c("chd","y")])
sums6k[1]/sums6k[2]
## chd
## 0.009992031