# Metagene Projection Methodology R script -- Leukemia 2 Example on only ALL samples
#
# Pablo Tamayo 3/29/2007 (tamayo@broad.mit.edu)
#
# This script runs the second Leukemia example (only ALL samples) from the paper:
#
# Metagene projection for cross-platform, cross-species characterization of global transcriptional states
# P. Tamayo, D. Scanfeld, B. L. Ebert, M. A. Gillette, C. W. M. Roberts, and J.P. Mesirov  
# Proc. Natl. Acad. Sci. USA, 104: 5959-5964 2007. http://www.pnas.org/cgi/content/abstract/0701068104v1
#
# It uses the main function "MetaGene.Projection(...)" which implements most of the method described in the
# paper (except for the "k" number-of-metagenes selection).
# The MP library is loaded (sourced), together with its other functions, from the library file "MP.Library.R".
# The GSEA analysis of the resulting metagenes and analysis of different choices for k (model selection)
# are implemented using separate functions and scripts.
#
# To run: cut and paste (or source) this code inside the R GUI console. The plots will be produced in
# the R GUI screen and also saved in files. Before running on new datasets try to reproduce the Leukemia1 example
# from the paper. Then to run on different datasets e.g. modify the pathnames and parameters below accordingly.
# This script takes about 6:29 minutes of CPU to run using R 2.2.1 on a Windows-XP Dell Inspiron 630m laptop.
#
# While running this script calls "MetaGene.Projection" (see below) which will produce the following output files
# under the directory specified by "output.dir":
#
# Main output files:
# <identifier>.<date>_<time>.log.txt = File containing the parameters used in the run and the date and time
# <identifier>.model_dataset.H.gct = projected model dataset
# <identifier>.all.H.cls =  projection of model + test datasets (cls phenotypes)
# <identifier>.all.H.gct = projection of model + test datasets (gct dataset)
# <identifier>.heatmap.jpeg = heat map of projection
# <identifier>.heatmap.sorted.jpeg = heat map of projection sorted inside each phenotype
# <identifier>.2D.proj.jpeg = 2D biplot of projected model and test datasets
# <identifier>.H.htree.jpeg = hierarchical tree built on the projected model and test datasets
# <identifier>.pred.gct = projection-based SVM prediction results (gct dataset)
# <identifier>.pred.txt = projection-based SVM prediction results (text file)
# <identifier>.H.mem.txt = clustering membership based on metagene with largest amplitude
#
# Other complementary output files:
# <identifier>.model.H.gct = H matrix from the NMF decomposition
# <identifier>.model.W.gct = W matrix from the NMF decomposition
# <identifier>.model_set.2.cls = model dataset after pre-processing and refinement (cls phenotypes)
# <identifier>.model_set.2.gct = model dataset after pre-processing and refinement (gct file)
# <identifier>.model_set.1.cls = model dataset after pre-processing and before refinement (cls phenotypes)
# <identifier>.model_set.1.gct = model dataset after pre-processing and before refinement (gct file)
# <identifier>.model_set.0.cls = model dataset before pre-processing but containing samples after refinement (cls phenotypes)
# <identifier>.model_set.0.gct = model dataset before pre-processing but containing samples after refinement (gct file)
# <identifier>.htree.jpeg = hierarchical tree on original pre-projection dataset
# <identifier>.all.cls = consolidated model + test dataset in the space of common genes (cls phenotypes)
# <identifier>.all.gct = consolidated model + test dataset in the space of common genes (gct dataset)
# <identifier>.prelim.pred.txt = preliminary projection-based SVM prediction results (used in refinement) (text file)

# Location of the Metagene Projection library file. Edit this path to match the local installation.
MP.library.location <- "c:/CGP2006/MPM/Software_and_Examples/Leukemia2/MP.Library.R"

# Load (source) the Metagene Projection library, which defines MetaGene.Projection() used below.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# verbose = TRUE prints diagnostics while the library is parsed and evaluated;
# max.deparse.length limits how many characters of each echoed expression are shown.
source(MP.library.location, verbose = TRUE, max.deparse.length = 9999)

# Define model & test datasets and parameters

# Model dataset: input files plus the pre-processing options applied to them.
# Data source: St. Jude dataset, Ross et al 2003 (PMID: 12730115).
model.dataset.table <- list(
  # Expression data (gct) and phenotype labels (cls) of the model dataset
  gct.file = "c:/CGP2006/MPM/Software_and_Examples/Leukemia2/ALL_only.gct",
  cls.file = "c:/CGP2006/MPM/Software_and_Examples/Leukemia2/ALL_only.cls",

  # Which subset (samples or phenotypes) to include; "ALL" (the default) keeps all of them
  column.subset = "ALL",
  # How column.subset is interpreted: "samples" or "phenotypes"
  column.sel.type = "samples",

  # Threshold and ceiling applied to the dataset before projection
  thres = 20,
  ceil = 100000,

  # Variation filter applied before projection:
  #   fold  = fold change (max/min), delta = absolute difference (max - min)
  fold = 8,
  delta = 800,

  # Normalization before projection (default 6: column-rank and rescaling normalization)
  norm = 6
)

# Test datasets: an unnamed list with one entry per test dataset, each entry holding the
# input files and the pre-processing options for that dataset. Every entry selects its
# columns by sample ("samples") and uses normalization 6. The order of the entries matters:
# the legend colors/symbols defined further down list the test phenotypes in this order.
test.datasets.table <- local({
  # Input files shared by several entries (paths kept verbatim)
  fine.gct <- "c:/CGP2006/MPM/Software_and_Examples/Leukemia2/Fine_et_al.maxed.gct"
  fine.cls <- "c:/CGP2006/MPM/Software_and_Examples/Leukemia2/Fine_et_al.cls"
  yeoh.gct <- "c:/CGP2006/MPM/Software_and_Examples/Leukemia2/Yeoh_et_al.gct"
  yeoh.cls <- "c:/CGP2006/MPM/Software_and_Examples/Leukemia2/Yeoh_et_al.cls"

  # Builds one test-dataset entry. `columns` is the sample subset to include;
  # `thres` / `ceil` default to the values used by most entries (20 / 100000).
  test.entry <- function(gct, cls, columns, thres = 20, ceil = 100000) {
    list(
      gct.file = gct,
      cls.file = cls,
      column.subset = columns,
      column.sel.type = "samples",
      thres = thres,
      ceil = ceil,
      norm = 6
    )
  }

  list(
    # BCR from Fine et al (threshold and ceiling given as the string "NULL")
    test.entry(fine.gct, fine.cls,
               c(1, 2, 13, 18, 27, 29, 39, 44, 45, 47, 49),
               thres = "NULL", ceil = "NULL"),

    # ALL_E2A from Yeoh et al 2002 (PMID: 12086872)
    test.entry(yeoh.gct, yeoh.cls, seq(2, 10)),

    # ALL MLL from Armstrong et al 2002 (PMID: 11731795)
    test.entry("c:/CGP2006/MPM/Software_and_Examples/Leukemia2/Armstrong_et_al.gct",
               "c:/CGP2006/MPM/Software_and_Examples/Leukemia2/Armstrong_et_al.cls",
               seq(25, 44)),

    # MLL from Fine et al (threshold and ceiling given as the string "NULL")
    test.entry(fine.gct, fine.cls,
               c(5, 10, 14, 19, 22, 28, 37, 38, 43, 48, 52),
               thres = "NULL", ceil = "NULL"),

    # ALL_T from Chiaretti et al 2004 (PMID: 14684422)
    test.entry("c:/CGP2006/MPM/Software_and_Examples/Leukemia2/Chiaretti_et_al.gct",
               "c:/CGP2006/MPM/Software_and_Examples/Leukemia2/Chiaretti_et_al.cls",
               "ALL"),

    # ALL_T from Yeoh et al 2002 (PMID: 12086872)
    test.entry(yeoh.gct, yeoh.cls, seq(12, 31)),

    # ALL_TEL from Yeoh et al 2002 (PMID: 12086872)
    test.entry(yeoh.gct, yeoh.cls, seq(42, 61)),

    # TEL from Fine et al (threshold and ceiling given as the string "NULL")
    test.entry(fine.gct, fine.cls,
               c(6, 9, 15, 16, 17, 20, 23, 24, 25, 30, 33, 42, 51),
               thres = "NULL", ceil = "NULL")
  )
})

# Define parameters for this specific run (see detailed definitions below)

# Run parameters. Most of them are forwarded unchanged to the MetaGene.Projection() call
# further down, where each one is described in more detail.
# Logical values are written as TRUE/FALSE throughout: T and F are ordinary variables
# that can be reassigned, whereas TRUE and FALSE are reserved words.

identifier           <- "Leukemia2.ALL"   # Prefix used to name the output files
k.proj               <- 5                 # Number of metagenes in the projection
alg                  <- "NMF.div"         # Projection algorithm (NMF with divergence cost)
niter                <- 2000              # Number of algorithm iterations
seed                 <- 1234              # Random seed to initialize the metagene matrices
nchar.phen           <- 9                 # Number of characters identifying classes in the CLS files
postprojnorm         <- TRUE              # Apply post-projection normalization
heatmap.row.norm     <- FALSE             # Row-normalize (standardize) the rows in the heat map
heatmap.cmap.type    <- 6                 # Heat map color map type
high.conf.thres      <- 0.30              # Confidence threshold separating calls from no-calls
output.dir           <- "c:/CGP2006/MPM/Software_and_Examples/Leukemia2/"  # Output directory
kernel               <- "radial"          # SVM kernel function: "radial" or "linear"
cost                 <- 1                 # SVM cost parameter
gamma                <- 0.05              # Gamma coefficient for the radial kernel
model.set.refinement <- TRUE              # Perform model set refinement

# NOTE(review): the three settings below are defined here but are not passed to the
# MetaGene.Projection() call in this script; confirm whether the library reads them
# some other way or whether they are leftovers.
use.biplot           <- TRUE
non.interactive.run  <- FALSE
use.feature.names    <- FALSE

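# Dataset key used in the phenotype labels of the legend below:
# D1 = Ross ALL, D2 = Ross AML, D3 = Yeoh, D4 = Chiaretti, D5 = Armstrong,
# D6 = Valk, D7 = Gutierrez, D8 = Bullinger, D9 = Bourquin,
# D10 = Fine (inferred: the D10 legend entries line up with the Fine et al test datasets above)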

# These are the symbols and colors to use for each phenotype in the model and test sets 
#          model samples:   square symbols
#            color  symbol      phenotype
# Colors and plotting symbols for each phenotype, as consecutive (color, symbol) pairs:
# first the model phenotypes, then the test phenotypes in the order of the test datasets.
# Mixing strings and numbers in c() stores everything as character; the symbol codes
# are converted back to numbers when `symbs` is extracted below.
# The last pair must NOT be followed by a comma: a trailing comma leaves an empty final
# argument and c() stops with an "argument is empty" error.
legend.list <- c(
  # model samples: square symbols
  # color            symbol    phenotype
  "green2",          22,     # ALL_BCR          ALL_BCR___D1
  "yellow2",         22,     # ALL_E2A          ALL_E2A___D1
  "paleturquoise2",  22,     # ALL_MLL          ALL_MLL___D1
  "chocolate3",      22,     # ALL_T            ALL_T_____D1
  "blue2",           22,     # ALL_TEL          ALL_TEL___D1
  # test samples: circles and other symbols
  # color            symbol    phenotype
  "green2",          21,     # ALL_BCR          ALL_BCR___D10
  "yellow2",         23,     # ALL_E2A          ALL_E2A___D3
  "paleturquoise2",  24,     # ALL_MLL          ALL_MLL___D5
  "paleturquoise4",  21,     # ALL_MLL          ALL_MLL___D10
  "chocolate3",      25,     # ALL_T            ALL_T_____D4
  "chocolate4",      21,     # ALL_T            ALL_T_____D3
  "blue2",           23,     # ALL_TEL          ALL_TEL___D3
  "blue4",           21      # ALL_TEL          ALL_TEL___D10
)

# Every color needs a matching symbol, so the list must contain an even number of items.
stopifnot(length(legend.list) %% 2 == 0)

# Graphical scaling for symbols in plots and plot legends
symbol.scaling <- 0.55

# Split the pairs: odd positions are colors, even positions are symbol codes.
col <- legend.list[seq(1, length(legend.list), 2)]
symbs <- as.numeric(legend.list[seq(2, length(legend.list), 2)])

# This is the call to the Metagene Projection function:

# Run the entire methodology (except for the GSEA analysis of metagenes and the
# model (k) selection). Every argument is passed by name from the variables defined above.
MetaGene.Projection(
  # R list with the model dataset parameters (see model.dataset.table above)
  model.dataset.table = model.dataset.table,
  # R list with the test dataset(s) parameters (see test.datasets.table above)
  test.datasets.table = test.datasets.table,
  # Prefix to name output files
  identifier = identifier,
  # Number of metagenes in projection
  k.proj = k.proj,
  # Algorithm for Metagene Projection (default NMF.div):
  #   "NMF.div" : Non-Negative Matrix Factorization using the divergence cost
  #   (other projection algorithms are internally supported but have not been tested)
  alg = alg,
  # Number of algorithm iterations (default: 2000)
  niter = niter,
  # Random seed to initialize metagene matrices (default: 1234)
  seed = seed,
  # Number of characters to use to identify classes from the CLS files
  nchar.phen = nchar.phen,
  # TRUE or FALSE: apply post-projection normalization
  #   (i.e. scale points to unit hypersphere, default: T)
  postprojnorm = postprojnorm,
  # TRUE or FALSE: row-normalize (standardize) the rows in the heat map (default F)
  heatmap.row.norm = heatmap.row.norm,
  # 1 = vintage pinkogram, 2 = scale of grays, 4 = high-resolution pinkogram,
  # 6 = reddish color map for metagene factors (default: 6)
  heatmap.cmap.type = heatmap.cmap.type,
  # Confidence threshold (Brier score) to separate calls from no-calls (default 0.3)
  high.conf.thres = high.conf.thres,
  # Output directory where the resulting output files will be produced
  output.dir = output.dir,
  # Colors for the legend symbols for phenotypes: first model and then test dataset(s)
  col = col,
  # Plotting symbols for phenotypes: first model and then test dataset(s)
  symbs = symbs,
  # Graphical scaling for symbols in plots and plot legends (default: 1)
  symbol.scaling = symbol.scaling,
  # Kernel function for SVM: "radial" or "linear" (default: "radial")
  kernel = kernel,
  # Cost parameter for SVM (default: 1)
  cost = cost,
  # Gamma coefficient for radial base function kernel (default: 0.05)
  gamma = gamma,
  # TRUE or FALSE: perform model set refinement (default: T)
  model.set.refinement = model.set.refinement
)

# end of metagene projection script example
