Biostat III exercises in R

Laboratory exercise 25

Suggested solutions by

Author: Bénédicte Delcoigne, 2015-03-06

Localised melanoma : generating and analysing a nested case-control (NCC) study


You may have to install the required packages the first time you use them. You can install a package by install.packages("package_of_interest") for each package you require.

require(foreign)  # for reading data set from Stata
require(survival) # cox and conditional logistic analyses
require(Epi)      # sample a nested case-control study from a cohort

Load the melanoma data. Restrict the data to the localised status of the melanoma and to the 10 firts years of follow-up. Use the time-on-study as the timescale. define the event as death from cancer.

## Get the data for exercise 25 and have a look at it
mel <- read.dta("http://biostat3.net/download/melanoma.dta")
head(mel)
mel <- subset(mel , stage=="Localised")               # restrict the cohort to stage==1
num_dupli  <- length(mel$id) - length(unique(mel$id)) # check if one line is one individual
num_dupli                                             # will be 0  if true
str(mel$status)                                       # check the structure of "status"
table (mel$status, useNA="always")                    # check the categories of "status"
mel$dc <- 0                                           # create an indicator for the event "Dead: cancer"
mel$dc[mel$status=="Dead: cancer"] <- 1               # dc == 1 if "Dead: Cancer" ; otherwise 0
table (mel$dc, useNA="always")                        # check
mel$surv_10y  <- ifelse( mel$surv_mm < 120 , mel$surv_mm, 120)   # restrict follow-up to 120 months
dim(mel[mel$surv_mm >= 120 & mel$dc==1,])             # how many cases after 120 months ?
mel[mel$surv_mm >= 120 & mel$dc==1,]$dc  <- 0         # events occuring after 120 months are changed to status = 0
table(mel$dc , mel$status)                            # check

The Cox proportional hazard analysis is done thanks to the “coxph” command in the “survival” package. It is often a good idea to check first the structure of the variables.

str(mel$sex) ; str(mel$year8594) ; str(mel$agegrp)    #Check structure of risk factors/confounders
##  Factor w/ 2 levels "Male","Female": 2 2 2 2 2 2 2 1 2 2 ...
##  Factor w/ 2 levels "Diagnosed 75-84",..: 1 1 1 1 1 1 1 1 1 1 ...
##  Factor w/ 4 levels "0-44","45-59",..: 4 4 4 4 4 4 4 4 4 4 ...
out_coh <- coxph(Surv(surv_10y,dc) ~ sex + year8594 + agegrp, data = mel, ties="breslow" )
summary(out_coh)
## Call:
## coxph(formula = Surv(surv_10y, dc) ~ sex + year8594 + agegrp, 
##     data = mel, ties = "breslow")
## 
##   n= 5318, number of events= 960 
## 
##                             coef exp(coef) se(coef)      z Pr(>|z|)    
## sexFemale               -0.52964   0.58881  0.06545 -8.092 5.55e-16 ***
## year8594Diagnosed 85-94 -0.33284   0.71688  0.06618 -5.029 4.93e-07 ***
## agegrp45-59              0.28247   1.32640  0.09417  2.999   0.0027 ** 
## agegrp60-74              0.61914   1.85732  0.09088  6.813 9.56e-12 ***
## agegrp75+                1.21570   3.37265  0.10444 11.641  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## sexFemale                  0.5888     1.6983    0.5179    0.6694
## year8594Diagnosed 85-94    0.7169     1.3949    0.6297    0.8162
## agegrp45-59                1.3264     0.7539    1.1028    1.5953
## agegrp60-74                1.8573     0.5384    1.5543    2.2194
## agegrp75+                  3.3727     0.2965    2.7484    4.1387
## 
## Concordance= 0.646  (se = 0.01 )
## Rsquare= 0.039   (max possible= 0.949 )
## Likelihood ratio test= 211.9  on 5 df,   p=0
## Wald test            = 217.1  on 5 df,   p=0
## Score (logrank) test = 225.9  on 5 df,   p=0

(a) How many individuals are in the study?

n_ind <- length(mel$id)
n_ind
## [1] 5318

(b) How many experience the event?

table(mel$dc, useNA="always")
## 
##    0    1 <NA> 
## 4358  960    0
ncase <-  table (mel$dc, useNA="always")[2]
ncase
##   1 
## 960

(c1) Generate a nested case-control study with 1 control per case. Cases and controls are matched on time since entry. This is done by using the function “cwcc” in the Epi package. Note that in the codes provided here, the variables “dc” and “id” are not necessary. They permit however to understand how the data are handled as for eample: how many individuals are sampled several times or how many cases happened to be sampled as controls before their failure time.

nccdata <-ccwc( entry=0, exit=surv_10y , fail=dc, origin=0, controls=1, include=list(sex,year8594,agegrp,dc,id), data=mel )
## 
## Sampling risk sets: .....................................................................................................................
tail(nccdata, 8)
##      Set  Map  Time Fail    sex        year8594 agegrp dc   id
## 1913 114 2674  87.5    1 Female Diagnosed 85-94   0-44  1 3784
## 1914 114 1806  87.5    0   Male Diagnosed 75-84  45-59  0 2532
## 1915 115 4275  98.5    1   Male Diagnosed 85-94  60-74  1 6168
## 1916 115  841  98.5    0   Male Diagnosed 75-84  45-59  0 1194
## 1917 116 5287   2.5    1   Male Diagnosed 85-94  45-59  1 7713
## 1918 116 1692   2.5    0 Female Diagnosed 75-84    75+  0 2381
## 1919 117 5314 119.5    1   Male Diagnosed 85-94  60-74  1 7753
## 1920 117 1591 119.5    0 Female Diagnosed 75-84  60-74  0 2234

(c2) Analyse the nested case-control data (with the survival package or the Epi package) and function “clogit”. As the ncc data was generated with the Epi package, we use the syntax of this package.

out_ncc <- clogit(Fail ~ sex + year8594 + agegrp + strata(Set), data=nccdata)
summary(out_ncc)
## Call:
## coxph(formula = Surv(rep(1, 1920L), Fail) ~ sex + year8594 + 
##     agegrp + strata(Set), data = nccdata, method = "exact")
## 
##   n= 1920, number of events= 960 
## 
##                             coef exp(coef) se(coef)      z Pr(>|z|)    
## sexFemale               -0.39811   0.67159  0.09570 -4.160 3.18e-05 ***
## year8594Diagnosed 85-94 -0.24510   0.78262  0.09454 -2.593  0.00952 ** 
## agegrp45-59              0.29070   1.33736  0.12563  2.314  0.02067 *  
## agegrp60-74              0.63077   1.87906  0.12613  5.001 5.70e-07 ***
## agegrp75+                1.21874   3.38292  0.16389  7.436 1.03e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## sexFemale                  0.6716     1.4890    0.5567    0.8101
## year8594Diagnosed 85-94    0.7826     1.2778    0.6503    0.9419
## agegrp45-59                1.3374     0.7477    1.0455    1.7107
## agegrp60-74                1.8791     0.5322    1.4675    2.4060
## agegrp75+                  3.3829     0.2956    2.4535    4.6644
## 
## Rsquare= 0.04   (max possible= 0.701 )
## Likelihood ratio test= 78.3  on 5 df,   p=1.887e-15
## Wald test            = 73.7  on 5 df,   p=1.732e-14
## Score (logrank) test = 76.72  on 5 df,   p=4.108e-15

(d) How many unique individuals are in our study?

n_uni <- length(unique(nccdata$id))
n_uni
## [1] 1721

(e) Compare the estimated parameters and standard errors between the full cohort anal- ysis and the nested case-control study. What is the relative efficiency (ratio of vari- ances) of the nested case-control compared to the full cohort design?

comp <- data.frame(summary(out_coh)$coef[c(1:5),2],summary(out_ncc)$coef[c(1:5),2])
colnames(comp) <- c("cohort HR", "NCC HR")
comp                           # print the HR estimates from the cohort and the NCC
##                         cohort HR    NCC HR
## sexFemale               0.5888144 0.6715855
## year8594Diagnosed 85-94 0.7168836 0.7826237
## agegrp45-59             1.3263971 1.3373644
## agegrp60-74             1.8573227 1.8790626
## agegrp75+               3.3726523 3.3829215
var <- data.frame((summary(out_coh)$coef[c(1:5),3])^2,(summary(out_ncc)$coef[c(1:5),3])^2,(summary(out_coh)$coef[c(1:5),3])^2 / (summary(out_ncc)$coef[c(1:5),3])^2 )
colnames(var) <- c("cohort var", "NCC var", "ratio coh/ncc" )
var                            # print the variances of estimates from the cohort and the NCC and their ratio
##                          cohort var     NCC var ratio coh/ncc
## sexFemale               0.004283691 0.009158668     0.4677199
## year8594Diagnosed 85-94 0.004380031 0.008937571     0.4900695
## agegrp45-59             0.008868629 0.015782478     0.5619288
## agegrp60-74             0.008258498 0.015907690     0.5191513
## agegrp75+               0.010906905 0.026859266     0.4060761

(f) Generate several nested case-control study and analyse them. A loop is generated with M sampling of NCC with 1 control per case. The codes provide the estimated HR for each loop i an data frame in which the first line contains the cohort’s HR. The codes provide also a summary table with the cohort’s HR and the mean and sd of the HR provided by the M loops. The histograms for each of the variables include a green vertical line at the cohort’s HR value and a red line at the HR loops’ mean.

M <- 10                   # Number of loops: change the M value to change the number of loops
param   <- matrix(0,M,5)  # Define the matrice of the coefficients
for (i in 1:M)  {         # Start of the loop, create NCC data and analyse it
    nccdata <-ccwc( entry=0, exit=surv_10y , fail=dc, origin=0, controls=1, include=list(sex,year8594,agegrp), data=mel )
    out_ncc <- clogit(Fail ~ sex + year8594 + agegrp + strata(Set), data=nccdata)
    param [ i , 1:5] <- coef(out_ncc)   # store the 5 coefficients M times
}                  # End of the loop
param <- as.data.frame(param)        # data frame with the coefficients
param <- exp(param)                  # data frame with the HR
param <- rbind (summary(out_coh)$coef[c(1:5),2], param)
colnames(param) <- c("sex Female","year_8594","agegrp45-59","agegrp60-74","agegrp75+")
rownames(param) <- c("cohort", c(1:M))
param
##        sex Female year_8594 agegrp45-59 agegrp60-74 agegrp75+
## cohort  0.5888144 0.7168836    1.326397    1.857323  3.372652
## 1       0.5382320 0.7554231    1.323630    1.884682  3.038380
## 2       0.6369779 0.6956233    1.162784    1.836277  2.708617
## 3       0.5675498 0.6224011    1.426560    2.267996  4.203403
## 4       0.5044197 0.6291665    1.108576    1.675955  4.005038
## 5       0.5291304 0.7551900    1.466869    1.951552  3.754084
## 6       0.5297884 0.7170982    1.379897    2.153230  3.479139
## 7       0.4756949 0.6473234    1.236832    1.701251  3.299077
## 8       0.5030739 0.7403986    1.367365    1.852736  4.099057
## 9       0.5579726 0.7307389    1.546071    1.884778  3.715192
## 10      0.5650544 0.7646863    1.305554    1.999088  4.315558
mean_param <- apply(param[c(2:M),], 2, mean)  # compute the mean of the HR for the M loops
sd_param <-  apply(param[c(2:M),], 2, sd)     # compute the sd of the HR for the M loops
par_sum <- rbind (summary(out_coh)$coef[c(1:5),2], mean_param, sd_param)
rownames(par_sum) <- c("cohort HR","mean HR NCC","sd HR NCC")
colnames(par_sum) <- c("sex Female","year_8594","agegrp45-59","agegrp60-74","agegrp75+")
par_sum
##             sex Female  year_8594 agegrp45-59 agegrp60-74 agegrp75+
## cohort HR   0.58881445 0.71688362   1.3263971   1.8573227 3.3726523
## mean HR NCC 0.53809330 0.69926256   1.3353979   1.9120509 3.5891097
## sd HR NCC   0.04669607 0.05340294   0.1432123   0.1927937 0.5035246
par(mfrow=c(2,3) )                                   # allow 2*3 graphs on the same page
hist(param[,1], main="sex Female", xlab="HR value")  # histogram of HR for variable sex (female vs male)
abline(v=summary(out_coh)$coef[1,2], col="green")
abline(v=mean_param[1], col="red")
hist(param[,2], main="year_8594", xlab="HR value")   # histogram of HR for variable period (later vs earlier)
abline(v=summary(out_coh)$coef[2,2], col="green")
abline(v=mean_param[2], col="red")
hist(param[,3], main="agegrp45-59", xlab="HR value") # histogram of HR for variable agegrp (45-59 vs reference)
abline(v=summary(out_coh)$coef[3,2], col="green")
abline(v=mean_param[3], col="red")
hist(param[,4], main="agegrp60-74", xlab="HR value") # histogram of HR for variable agegrp (60-74 vs reference)
abline(v=summary(out_coh)$coef[4,2], col="green")
abline(v=mean_param[4], col="red")
hist(param[,5], main="agegrp75+", xlab="HR value")   # histogram of HR for variable agegrp (75+ vs reference)
abline(v=summary(out_coh)$coef[5,2], col="green")
abline(v=mean_param[5], col="red")
                                        # obs: the vertical green line is at the cohort's HR value, the red line at the mean of the M loops