######################################################################
#
# PROGRAM 2: R Function Guyot.individual.data
#
#
# R FUNCTION TO GET INDIVIDUAL PATIENT DATA USING 
# THE PREPROCESSED DIGITIZED DATA.
#
# This R function is developed entirely by 
# Guyot et al (2012), BMC Medical Research Methodology, 12: 9, PMID:22297116
#
# We have converted it to a function called Guyot.individual.data 
# for ease of use to extract data from multiple lines, as needed in our applications. 
#
#
# The inputs are:
#          condensed.data = preprocessed digitized data
#
#          nrisk.data = summary of number of patients at risk, as required for this function.
#                              Details are in the Guyot et al (2012) paper. 
#
#          input.arm.id = treatment/biomarker arm. 
# 
#          tot.events = total number of events, if given in the paper that reports the figure. 
#                              NA if it is not reported in the paper. 
#
#############


# Guyot.individual.data
#
# Rebuilds individual patient data (IPD) from a digitized Kaplan-Meier curve
# and the published numbers at risk, using the algorithm of Guyot et al (2012).
#
# Arguments:
#   condensed.data  Two-column table of the digitized curve. Column 1 is read
#                   as time and column 2 as survival in percent (it is divided
#                   by 100 below).
#   nrisk.data      Four-column table with one row per at-risk interval:
#                   column 2 / column 3 are the first / last row index of
#                   condensed.data belonging to the interval and column 4 is
#                   the number at risk. Column 1 (the at-risk time) is not
#                   used by the computation.
#   input.arm.id    Value copied into the third output column of every row.
#   tot.events      Total number of events. Pass NA, NULL or the legacy
#                   string "NA" (the default) when it is not reported.
#
# Returns:
#   A matrix with columns "time", "event" (1 = event, 0 = censored) and
#   "tmt.arm.number". It is initialised with one row per patient at risk in
#   the first interval (nrisk.data[1, 4]).
Guyot.individual.data <- function(condensed.data, nrisk.data, input.arm.id, tot.events="NA"){

      # The original code attached these packages unconditionally. Nothing below
      # calls them, so they are attached (in the same order) only when they are
      # installed; a missing package no longer stops the reconstruction.
      for (pkg in c("MASS", "splines", "survival")){
          if (requireNamespace(pkg, quietly=TRUE)) library(pkg, character.only=TRUE)
      }

      # TRUE only when a usable event total was supplied. A plain
      # `tot.events != "NA"` test fails with an error for a real NA or NULL.
      events.reported <- !is.null(tot.events) && !is.na(tot.events) && tot.events != "NA"

      ############################################
      #Read in survival times read by DigitizeIt
      ############################################
      t.S <- condensed.data[,1]
      # Force the curve to be non-increasing: every reading is capped at the
      # smallest value seen so far.
      S <- cummin(condensed.data[,2]/100)

      #Read in published numbers at risk, n.risk, with the lower and upper
      #row indexes of each time interval
      lower <- nrisk.data[,2]
      upper <- nrisk.data[,3]
      n.risk <- nrisk.data[,4]
      n.int <- length(n.risk)
      n.t <- upper[n.int]

      #Initialise vectors
      arm <- rep(input.arm.id, n.risk[1])
      n.censor <- rep(0, (n.int-1))
      n.hat <- rep(n.risk[1]+1, n.t)
      cen <- rep(0, n.t)
      d <- rep(0, n.t)
      KM.hat <- rep(1, n.t)
      last.i <- rep(1, n.int)
      sumdL <- 0

      # Place n.cens censoring times evenly (end points excluded) between
      # t.S[idx.from] and t.S[idx.to] and count how many fall between each pair
      # of consecutive curve times. Only called with n.cens > 0.
      spread.censoring <- function(n.cens, idx.from, idx.to){
          cen.t <- t.S[idx.from] + (1:n.cens)*(t.S[idx.to]-t.S[idx.from])/(n.cens+1)
          hist(cen.t, breaks=t.S[idx.from:idx.to], plot=FALSE)$counts
      }

      if (n.int > 1){
          #Time intervals 1,...,(n.int-1)
          for (i in 1:(n.int-1)){
              rows <- lower[i]:upper[i]

              #First approximation of no. censored on interval i
              n.censor[i] <- round(n.risk[i]*S[lower[i+1]]/S[lower[i]] - n.risk[i+1])

              # Clamp added by Jaya: a negative first guess means no censoring.
              if (n.censor[i] <= 0){
                  n.censor[i] <- 0
                  cen[rows] <- 0
              }

              #Adjust tot. no. censored until n.hat = n.risk at start of interval (i+1)
              while ((n.hat[lower[i+1]] > n.risk[i+1]) ||
                     ((n.hat[lower[i+1]] < n.risk[i+1]) && (n.censor[i] > 0))){
                  if (n.censor[i] <= 0){
                      cen[rows] <- 0
                      n.censor[i] <- 0
                  } else {
                      #Distribute censored observations evenly over time.
                      cen[rows] <- spread.censoring(n.censor[i], lower[i], lower[i+1])
                  }

                  #Find no. events and no. at risk on each interval to agree with K-M estimates read from curves
                  n.hat[lower[i]] <- n.risk[i]
                  last <- last.i[i]
                  for (k in rows){
                      if (i == 1 && k == lower[i]){
                          # Very first curve point: nobody has had an event yet.
                          d[k] <- 0
                          KM.hat[k] <- 1
                      } else {
                          d[k] <- round(n.hat[k]*(1-(S[k]/KM.hat[last])))
                          KM.hat[k] <- KM.hat[last]*(1-(d[k]/n.hat[k]))
                      }
                      n.hat[k+1] <- n.hat[k]-d[k]-cen[k]
                      # `last` tracks the most recent row that had an event.
                      if (d[k] != 0) last <- k
                  }

                  # Shift the censoring count by the remaining at-risk mismatch.
                  n.censor[i] <- n.censor[i] + (n.hat[lower[i+1]]-n.risk[i+1])

                  # Clamp added by Jaya, as above.
                  if (n.censor[i] <= 0){
                      n.censor[i] <- 0
                      cen[rows] <- 0
                  }
              }

              # If the estimate is still below the published number, use the estimate.
              if (n.hat[lower[i+1]] < n.risk[i+1]) n.risk[i+1] <- n.hat[lower[i+1]]
              last.i[(i+1)] <- last
          }
      }

      #Time interval n.int.
      final.rows <- lower[n.int]:upper[n.int]
      final.cen.rows <- lower[n.int]:(upper[n.int]-1)

      if (n.int > 1){
          #Assume same censor rate as average over previous time intervals.
          n.censor[n.int] <- min(round(sum(n.censor[1:(n.int-1)])*(t.S[upper[n.int]]-
                             t.S[lower[n.int]])/(t.S[upper[(n.int-1)]]-t.S[lower[1]])), n.risk[n.int])
      }
      if (n.int == 1){
          n.censor[n.int] <- 0
      }
      if (n.censor[n.int] <= 0){
          cen[final.cen.rows] <- 0
          n.censor[n.int] <- 0
      } else {
          cen[final.cen.rows] <- spread.censoring(n.censor[n.int], lower[n.int], upper[n.int])
      }

      #Find no. events and no. at risk on each interval to agree with K-M estimates read from curves
      n.hat[lower[n.int]] <- n.risk[n.int]
      last <- last.i[n.int]
      for (k in final.rows){
          if (KM.hat[last] != 0){
              d[k] <- round(n.hat[k]*(1-(S[k]/KM.hat[last])))
          } else {
              d[k] <- 0
          }
          KM.hat[k] <- KM.hat[last]*(1-(d[k]/n.hat[k]))
          n.hat[k+1] <- n.hat[k]-d[k]-cen[k]
          #No. at risk cannot be negative
          if (n.hat[k+1] < 0){
              n.hat[k+1] <- 0
              cen[k] <- n.hat[k] - d[k]
          }
          if (d[k] != 0) last <- k
      }

      #If total no. of events reported, adjust no. censored so that total no. of events agrees.
      if (events.reported){
          if (n.int > 1){
              sumdL <- sum(d[1:upper[(n.int-1)]])
              #If total no. events already too big, then set events and censoring = 0 on all further time intervals
              if (sumdL >= tot.events){
                  d[final.rows] <- rep(0, (upper[n.int]-lower[n.int]+1))
                  cen[final.cen.rows] <- rep(0, (upper[n.int]-lower[n.int]))
                  n.hat[(lower[n.int]+1):(upper[n.int]+1)] <- rep(n.risk[n.int], (upper[n.int]+1-lower[n.int]))
              }
          }
          #Otherwise adjust no. censored to give correct total no. events
          if ((sumdL < tot.events) || (n.int == 1)){
              sumd <- sum(d[1:upper[n.int]])
              while ((sumd > tot.events) || ((sumd < tot.events) && (n.censor[n.int] > 0))){
                  n.censor[n.int] <- n.censor[n.int] + (sumd - tot.events)
                  if (n.censor[n.int] <= 0){
                      cen[final.cen.rows] <- 0
                      n.censor[n.int] <- 0
                  } else {
                      cen[final.cen.rows] <- spread.censoring(n.censor[n.int], lower[n.int], upper[n.int])
                  }
                  n.hat[lower[n.int]] <- n.risk[n.int]
                  last <- last.i[n.int]
                  for (k in final.rows){
                      d[k] <- round(n.hat[k]*(1-(S[k]/KM.hat[last])))
                      KM.hat[k] <- KM.hat[last]*(1-(d[k]/n.hat[k]))
                      if (k != upper[n.int]){
                          n.hat[k+1] <- n.hat[k]-d[k]-cen[k]
                          #No. at risk cannot be negative
                          if (n.hat[k+1] < 0){
                              n.hat[k+1] <- 0
                              cen[k] <- n.hat[k] - d[k]
                          }
                      }
                      if (d[k] != 0) last <- k
                  }
                  sumd <- sum(d[1:upper[n.int]])
              }
          }
      }

      ### Now form IPD ###
      # Every patient starts as censored at the last curve time; the loops
      # below overwrite the leading rows with events and earlier censorings.
      t.IPD <- rep(t.S[n.t], n.risk[1])
      event.IPD <- rep(0, n.risk[1])

      #Write event time and event indicator (=1) for each event, as separate row in t.IPD and event.IPD
      k <- 1
      for (j in 1:n.t){
          if (d[j] != 0){
              t.IPD[k:(k+d[j]-1)] <- rep(t.S[j], d[j])
              event.IPD[k:(k+d[j]-1)] <- rep(1, d[j])
              k <- k+d[j]
          }
      }
      #Write censor time and event indicator (=0) for each censor, as separate row in t.IPD and event.IPD.
      #Censor times are placed midway between consecutive curve times.
      for (j in 1:(n.t-1)){
          if (cen[j] != 0){
              t.IPD[k:(k+cen[j]-1)] <- rep(((t.S[j]+t.S[j+1])/2), cen[j])
              event.IPD[k:(k+cen[j]-1)] <- rep(0, cen[j])
              k <- k+cen[j]
          }
      }

      #Output IPD
      IPD <- matrix(c(t.IPD, event.IPD, arm), ncol=3, byrow=FALSE)
      colnames(IPD) <- c("time", "event", "tmt.arm.number")

      return(IPD)
}


