#############################################################
# hc.clustid.GP.R                            Dec.21, 2005
#     Yujin Hoshida (Broad Institute)
#############################################################

hc.clustid.GP<-function(input.filename=NULL,output.file=NULL,min.sample=10,cls.start.from="1",sample.info="F")
{

  search.depth<-30
  min.sample<-as.integer(min.sample)

  dchip<-"F"

  # from .atr file
  
  if (length(grep(".atr",input.filename))!=0){

    input.data<-read.delim(input.filename,header=F)

    output.file<-sub(".atr","",output.file)

    # convert .atr to dChip tree file
    child.1<-input.data[,1:2]
    child.2<-input.data[,c(1,3)]
    colnames(child.1)<-colnames(child.2)<-c("Parent","Name")
    dchip.master<-rbind(child.1,child.2)

    # child
    child.number<-as.character(dchip.master$Name)
    child.number<-gsub("X","",child.number)
    child.number<-gsub("ARRY","",child.number)
    child.number<-gsub("NODE","999",child.number)
    child.number<-as.data.frame(as.numeric(child.number))
    colnames(child.number)<-"Number"

    # parent
    parent.number<-as.character(dchip.master$Parent)
    parent.number<-gsub("X","",parent.number)
    parent.number<-gsub("ARRY","",parent.number)
    parent.number<-gsub("NODE","999",parent.number)
    parent.number<-as.data.frame(as.numeric(parent.number))
    colnames(parent.number)<-"Parent"

    data<-cbind(child.number,parent.number)
    last.line<-t(as.data.frame(c(-1,max(data$Parent))))
    colnames(last.line)<-c("Parent","Number")

    data<-rbind(data,last.line)
    data<-data[order(data$Number),]

    num.sample<-(length(input.data[,1])+1)
    num.node<-length(data[,1])
    root.node<-data$Number[num.node]
  }

  # from dChip Tree file
  
  if (length(grep("tree.xls",input.filename))!=0){
    dchip<-"T"
    pre.input.data<-read.delim(input.filename,header=F,skip=2)
    input.data<-pre.input.data[,1:7]  # eliminate 8th "NA" column
    output.file<-sub(" cluster tree.xls","",output.file)

    # get gene/sample number from header
    for.header<-read.delim(input.filename,header=F)
    header<-as.matrix(for.header[1,])
    num.gene<-as.numeric(header[2])
    num.sample<-as.numeric(header[3])
    rm(for.header)
    rm(header)

    # extract sample tree info
    start.row.sample<-(num.gene+num.gene)
    end.row.sample<-(length(input.data[,1]))
    num.node<-(end.row.sample-start.row.sample+1)
    data<-input.data[start.row.sample:end.row.sample,2:4]
    colnames(data)<-c("Number","Parent","Child1")
    num.sample<-length(data$Child1[data$Child1==-1])
    root.node<-data$Number[num.node]
  }

  # convert dChip tree file into a hierarchy matrix

  pc.matrix.w.na<-matrix(NA,nrow=num.sample,ncol=search.depth)
  max.depth<-0

  for (c in 1:num.sample){
    child<-data$Number[c]
    parent<-NA
    pc.vector<-child
    for (depth in 1:search.depth){
      parent<-get.parent(data,child,num.node)
      if (parent==-1){                        # parent==-1: root node
        break
      }else{
        pc.vector<-c(pc.vector,parent)
        child<-parent
      }
    }
    if (max.depth<length(pc.vector)){     # update actual max depth
      max.depth<-length(pc.vector)
    }
    pc.matrix.w.na[c,1:length(pc.vector)]<-rev(pc.vector)
  }

  pc.matrix<-pc.matrix.w.na[,1:max.depth]
#  write.table(pc.matrix,"pc.matrix.txt",quote=F,sep="\t",row.names=T,col.names=F)

  # generate clustid

  pc.data.frame<-as.data.frame(pc.matrix)

  # replace NA by parent
  for (na.col in 2:length(pc.data.frame[1,])){
    for (na.row in 1:num.sample){
      if (is.na(pc.data.frame[na.row,na.col])){
        pc.data.frame[na.row,na.col]<-pc.data.frame[na.row,(na.col-1)]
      }
    }
  }

  for (i in 3:length(pc.data.frame[1,])){
    tab<-table(pc.data.frame[,(i-1):i])
    lower.k<-length(tab[,1])
    for (low in 1:lower.k){
      if (min(tab[low,][tab[low,]!=0])<min.sample){
        row.label<-rownames(tab)
        parent.name<-as.numeric(row.label[low])
        for (j in 1:num.sample){
          if (pc.data.frame[j,(i-1)]==parent.name){
            pc.data.frame[j,i]<-parent.name
          }
        }
      }
    }
  }

  # summarize redundant column

  hc.clustid<-NULL             # remove root node
  for (i in 2:length(pc.data.frame[1,])){
    if (sum(abs(pc.data.frame[,(i-1)]-pc.data.frame[,i]))!=0){
      hc.clustid<-cbind(hc.clustid,pc.data.frame[,i])
    }
  }

  # change cls names to 1,2,...

  for (c in 1:length(hc.clustid[1,])){
    tab<-table(as.data.frame(hc.clustid[,c]))
    cls.name<-as.numeric(rownames(tab))
    cls.name.rank<-rank(cls.name)
    for (r in 1:length(hc.clustid[,1])){
      for (cls in 1:length(cls.name)){
        if (hc.clustid[r,c]==cls.name[cls]){
          hc.clustid[r,c]<-cls.name.rank[cls]
        }
      }
    }
  }

  # output

  out.clustid<-hc.clustid

  # .cls files
  for (i in 1:length(hc.clustid[1,])){
    output.file.cls<-paste(output.file,"_",max(hc.clustid[,i]),"classes.cls",sep="")
    header<-paste(length(hc.clustid[,1]),max(hc.clustid[,i]),1,sep=" ")
    if (cls.start.from!="1"){
      out.clustid[,i]<-hc.clustid[,i]-1
    }
    cls.name<-t(as.data.frame(c("#",unique(out.clustid[,i]))))
    cls.label<-t(as.data.frame(out.clustid[,i]))

    write.table(header,output.file.cls,quote=F,sep="",row.names=F,col.names=F)
    write.table(cls.name,output.file.cls,quote=F,sep=" ",row.names=F,col.names=F,append=T)
    write.table(cls.label,output.file.cls,quote=F,sep=" ",row.names=F,col.names=F,append=T)
  }

  # dChip sample info file
  if (sample.info=="T"){
    if (dchip=="T"){
      sample.name<-as.character(input.data[start.row.sample:(start.row.sample+num.sample-1),1])
      if (cls.start.from!="1"){
        out.clustid<-hc.clustid-1
      }
      cls.col.label<-NULL
      for (i in 1:length(hc.clustid[1,])){
        cls.col.label<-c(cls.col.label,paste("No.of.classes_",max(hc.clustid[,i]),sep=""))
      }
      out.clustid<-cbind(sample.name,sample.name,out.clustid)
      colnames(out.clustid)<-c("SampleName","SampleName",cls.col.label)

      write.table(out.clustid,paste(output.file,"_sample_info.txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T)
    }
  }

}



# auxiliary function

get.parent<-function(data,child,num.node)
{

  parent.out<-NA
  for (p in 1:num.node){
    if (data$Number[p]==child){
      parent.out<-data$Parent[p]
    }
  }
  return(parent.out)
}
      


#na.omited<-t(apply(pc.matrix,1,na.omit))   # for both df & matrix
