###############################
#
# EXAMPLE OF RUNNING THE DATA EXTRACTION R PROGRAMS
#
# April 19, 2017
#
# JM Satagopan (satagopj@mskcc.org)
# A Iasonos (iasonosa@mskcc.org)
# JG Kanik (kanikj@mskcc.org)
# Memorial Sloan Kettering Cancer Center, New York, NY, USA  
#
# Title of Paper: "A reconstructed melanoma data set for evaluating Differential treatment benefit
# according to biomarker subgroups"
#
# Journal: Data in Brief
#
#
# The WORD document computer-programs-for-web-page.docx contains all the R programs, the instructions given below, 
# and additional R programs for evaluating differential treatment benefits that are described in: 
#
# JM Satagopan and A Iasonos (2017). Measuring differential treatment benefit across marker specific subgroups:
# A choice of the outcome scale. Contemporary Clinical Trials, PMID: 28254404.
# pii: S1551-7144(16)30093-3. doi: 10.1016/j.cct.2017.02.007. 
#
####################
#
#
# Before running these R programs, you need to use Adobe Illustrator to 
# extract the lines from Kaplan-Meier curves, and use a digitization software
# to extract the (x,y) coordinates for various points on the line. These steps
# are described in our paper (title given above). 
#
# There are 6 text files containing (x,y) coordinates. These files give the "time" (x-axis)
# and "kaplan-meier estimates of survival probabilities" (y-axis) for the 6 lines given in 
# Figures 1B and 1C of:
# 
# J Larkin et al (2015), New England Journal of Medicine, 373: 23 - 34. PMID: 26027431.
# https://www.ncbi.nlm.nih.gov/pubmed/26027431
#
#
# These 6 lines correspond to 3 treatments (nivolumab monotherapy, ipilimumab monotherapy, 
# and combination therapy) and 2 PD-L1 expression levels (positive and negative). 
# 
# The (x,y) values corresponding to these lines are stored in the following text files:
#
#  pdl1-negative-nivo.txt
#  pdl1-negative-ipi.txt
#  pdl1-negative-combo.txt
#  pdl1-positive-nivo.txt
#  pdl1-positive-ipi.txt
#  pdl1-positive-combo.txt
#
#
# Please see our paper in "Data in Brief" for instructions on how to obtain these 6 files using
# Adobe Illustrator and a digitization software package. 
#
# These 6 text files, program-1.R and program-2.R can be used to obtain patient-level data
# using the instructions given below. The R object "individual.data" will contain the 
# required data set, which is also given in the file LARKIN-PFS-PATIENT.DATA.txt
#
#
###############################

###################
#
# Read program-1.R and program-2.R using the source function
#
################### 

# Evaluate both program files so that the definitions they contain are
# available to the commands further down in this script.
source(file = "program-1.R")
source(file = "program-2.R")


#############
#
# First, digitize each line in Figure 1B and 1C of Larkin et al (2015, PMID: 26027431) 
# to obtain the (x,y) coordinates for each line that correspond to times and 
# survival probabilities. This is described in Step 2 of the manuscript
# 
# For each line, this will result in a matrix with 2 columns. 
# Column 1 is time and Column 2 is survival probability for that time. 
# This will be given for various time points. 
#
# This will result in a total of 6 such data files, one file per line. 
# There are 3 lines in Figure 1B and 3 in Figure 1C of Larkin et al. Hence, 6 files. 
# 
# We have named these 6 files as follows: 
# 
# pdl1-negative-nivo.txt, pdl1-negative-ipi.txt, pdl1-negative-combo.txt,
# pdl1-positive-nivo.txt, pdl1-positive-ipi.txt, pdl1-positive-combo.txt
# 
# The R object digitized.file.names, given below, contains the names of these files. 
#
##############


# Names of the six digitized (x,y) files. The order is PD-L1 negative first,
# then PD-L1 positive; within each PD-L1 level: nivo, ipi, combo.
digitized.file.names <- paste0(
    "pdl1-",
    rep(c("negative", "positive"), each = 3),
    "-",
    rep(c("nivo", "ipi", "combo"), times = 2),
    ".txt"
)


################################
#
# Now, look below Figures 1B and 1C of Larkin et al and write down the 
# number at risk data given for various time points for each line. 
#
# The list numbers.below.figure, shown below, contains these data. 
#
################################

# Numbers at risk, one numeric vector per line (arm), listed in the same
# order as the entries of digitized.file.names.
#
# NOTE(review): the number of entries differs from the length of the matching
# `time` vector for some arms (e.g. pdl1.negative.nivo has 20 entries while
# 0:18 has 19; the three pdl1.positive arms have 17 entries while 0:17 has 18).
# How preprocess.digitized.data treats this is not visible here -- confirm.
numbers.below.figure <- list(
    pdl1.negative.nivo = c(
        208, 192, 178, 108, 105, 98, 88, 80, 76, 74,
        63, 50, 31, 24, 9, 5, 4, 2, 1, 1
    ),
    pdl1.negative.ipi = c(
        202, 183, 166, 82, 72, 59, 44, 39, 35, 31,
        26, 22, 12, 8, 3, 1
    ),
    pdl1.negative.combo = c(
        210, 195, 181, 142, 134, 123, 112, 106, 105, 96,
        88, 79, 42, 36, 13, 9, 6, 2, 1
    ),
    pdl1.positive.nivo = c(
        80, 76, 71, 57, 56, 54, 51, 49, 49, 43,
        38, 32, 16, 13, 5, 4, 2
    ),
    pdl1.positive.ipi = c(
        75, 69, 66, 40, 33, 24, 22, 21, 21, 17,
        16, 15, 9, 6, 3, 2, 2
    ),
    pdl1.positive.combo = c(
        68, 63, 61, 53, 52, 47, 44, 42, 42, 39,
        34, 24, 16, 12, 3, 1, 1
    )
)


####################
#
# Now, specify how far along on the x-axis of each line we want to go to extract data. 
# For example, in one line we may want to go up to time 15 units, 
# in another line up to time 18 units etc. 
# 
# As above, organize these times (integer values) for each line in the same order 
# as the file names in digitized.file.names. 
#
####################

# Integer time points 0, 1, ..., last for each line (arm), in the same order
# as digitized.file.names. The value given per arm is the last time point.
time <- lapply(
    c(
        time.pdl1.neg.nivo  = 18L,
        time.pdl1.neg.ipi   = 15L,
        time.pdl1.neg.combo = 18L,
        time.pdl1.pos.nivo  = 17L,
        time.pdl1.pos.ipi   = 17L,
        time.pdl1.pos.combo = 17L
    ),
    function(last) 0:last
)



################
#
# arm indicator
#
# 1 = pdl1-neg-nivo
# 2 = pdl1-neg-ipi
# 3 = pdl1-neg-combo
# 4 = pdl1-pos-nivo
# 5 = pdl1-pos-ipi
# 6 = pdl1-pos-combo
#
################


##############
#
# The R functions preprocess.digitized.data (Program 1) 
# and Guyot.individual.data (Program 2) must already be loaded; they are read 
# into R by the source() calls above. Then execute the commands below to get individual.data.
#
##############

# Build the patient-level data set, one digitized line (arm) at a time.
#
# For each file named in digitized.file.names:
#   1. read the digitized (x,y) coordinates (the file is read with a header row);
#   2. pass them, together with the matching entries of numbers.below.figure
#      and time, to preprocess.digitized.data;
#   3. pass the resulting condensed.data.set and nrisk.data to
#      Guyot.individual.data, using the file index as the arm id.
#
# The per-arm results are stored in a preallocated list and row-bound once
# after the loop, rather than growing individual.data on every iteration.
individual.data.by.arm <- vector("list", length(digitized.file.names))
for (ifile in seq_along(digitized.file.names)) {
    digitized.line <- read.table(digitized.file.names[ifile], header = TRUE)
    processed.line.data <- preprocess.digitized.data(
        digitized.line,
        numbers.below.figure[[ifile]],
        time[[ifile]]
    )
    individual.line.data <- Guyot.individual.data(
        processed.line.data$condensed.data.set,
        processed.line.data$nrisk.data,
        input.arm.id = ifile
    )
    individual.data.by.arm[[ifile]] <- individual.line.data
}
individual.data <- do.call(rbind, individual.data.by.arm)

# Label every row from its own arm number instead of assuming that the rows
# are sorted by arm. Position k of each lookup vector describes arm k:
#   1 = pdl1-neg-nivo, 2 = pdl1-neg-ipi, 3 = pdl1-neg-combo,
#   4 = pdl1-pos-nivo, 5 = pdl1-pos-ipi, 6 = pdl1-pos-combo
arm.number <- individual.data[, "tmt.arm.number"]
arm.treatment.type <- c("nivolumab", "ipilimumab", "combination",
                        "nivolumab", "ipilimumab", "combination")
arm.pdl1.status <- c("negative", "negative", "negative",
                     "positive", "positive", "positive")

treatment.type <- arm.treatment.type[arm.number]
pdl1.status <- arm.pdl1.status[arm.number]

individual.data <- as.data.frame(individual.data)
individual.data$treatment.type <- treatment.type
individual.data$pdl1.status <- pdl1.status

############
#
# The R object "individual.data" contains the digitized data set
#
############
