# Metagene Projection Methodology R script -- Lung 1 Example
#
# Pablo Tamayo 3/29/2007 (tamayo@broad.mit.edu)
#
# This script runs the Lung 1 example from the paper:
#
# Metagene projection for cross-platform, cross-species characterization of global transcriptional states
# P. Tamayo, D. Scanfeld, B. L. Ebert, M. A. Gillette, C. W. M. Roberts, and J.P. Mesirov  
# Proc. Natl. Acad. Sci. USA, 104: 5959-5964 2007. http://www.pnas.org/cgi/content/abstract/0701068104v1
#
# It uses the main function "MetaGene.Projection(...)", which implements most of the method described in the
# paper (except for the "k" number-of-metagenes selection).
# The MP library is loaded (sourced), together with other functions, from the library file "MP.Library.R".
# The GSEA analysis of the resulting metagenes and analysis of different choices for k (model selection)
# are implemented using separate functions and scripts.
#
# To run: cut and paste (or source) this code inside the R GUI console. The plots will be produced in
# the R GUI screen and also saved in files. Before running on new datasets try to reproduce the Leukemia1 example
# from the paper. Then, to run on different datasets, modify the pathnames and parameters below accordingly.
# This script takes about 4:30 minutes of CPU to run using R 2.2.1 on a Windows-XP Dell Inspiron 630m laptop.
#
# While running this script calls "MetaGene.Projection" (see below) which will produce the following output files
# under the directory specified by "output.dir":
#
# Main output files:
# <identifier>.<date>_<time>.log.txt = File containing the parameters used in the run and the date and time
# <identifier>.model_dataset.H.gct = projected model dataset
# <identifier>.all.H.cls =  projection of model + test datasets (cls phenotypes)
# <identifier>.all.H.gct = projection of model + test datasets (gct dataset)
# <identifier>.heatmap.jpeg = heat map of projection
# <identifier>.heatmap.sorted.jpeg = heat map of projection sorted inside each phenotype
# <identifier>.2D.proj.jpeg = 2D biplot of projected model and test datasets
# <identifier>.H.htree.jpeg = hierarchical tree built on the projected model and test datasets
# <identifier>.pred.gct = projection-based SVM prediction results (gct dataset)
# <identifier>.pred.txt = projection-based SVM prediction results (text file)
# <identifier>.H.mem.txt = clustering membership based on metagene with largest amplitude
#
# Other complementary output files:
# <identifier>.model.H.gct = H matrix from the NMF decomposition
# <identifier>.model.W.gct = W matrix from the NMF decomposition
# <identifier>.model_set.2.cls = model dataset after pre-processing and refinement (cls phenotypes)
# <identifier>.model_set.2.gct = model dataset after pre-processing and refinement (gct file)
# <identifier>.model_set.1.cls = model dataset after pre-processing and before refinement (cls phenotypes)
# <identifier>.model_set.1.gct = model dataset after pre-processing and before refinement (gct file)
# <identifier>.model_set.0.cls = model dataset before pre-processing but containing samples after refinement (cls phenotypes)
# <identifier>.model_set.0.gct = model dataset before pre-processing but containing samples after refinement (gct file)
# <identifier>.htree.jpeg = hierarchical tree on original pre-projection dataset
# <identifier>.all.cls = consolidated model + test dataset in the space of common genes (cls phenotypes)
# <identifier>.all.gct = consolidated model + test dataset in the space of common genes (gct dataset)
# <identifier>.prelim.pred.txt = preliminary projection-based SVM prediction results (used in refinement) (text file)

# Path to the Metagene Projection library file; edit it to match the local installation.
MP.library.location  <-  "c:/CGP2006/MPM/Software_and_Examples/Lung/MP.Library.R"
# Load (source) the Metagene Projection library, echoing the sourced code.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
source(MP.library.location, verbose=TRUE, max.deparse.length=9999)

# Define model & test datasets and parameters

# Model dataset: the input files and the pre-processing options applied to them.
# Data: subset of samples from the Boston dataset (plus cell lines).
model.dataset.table <- list(
  gct.file        = "c:/CGP2006/MPM/Software_and_Examples/Lung/boston.plus.cell.lines.maxed.2.gct",
  cls.file        = "c:/CGP2006/MPM/Software_and_Examples/Lung/boston.plus.cell.lines.2.cls",
  # Subset (samples or phenotypes) to include; the default is "ALL" (all of them).
  column.subset   = c(6:20, 26:40, 46:82),
  # How column.subset is interpreted: "samples" (used here) or "phenotypes".
  column.sel.type = "samples",
  # Threshold and ceiling applied to the dataset before projection ("NULL" is passed as a string).
  thres           = "NULL",
  ceil            = "NULL",
  # Variation filter before projection: fold change (max/min) and absolute difference (max - min).
  fold            = 3,
  delta           = 300,
  # Normalization before projection (default 6: column-rank and rescaling normalization).
  norm            = 6
)

# Test datasets: one inner list per test dataset, each giving the input files and
# the pre-processing options for that dataset. The fields of each entry are:
#   gct.file / cls.file : expression dataset and phenotype files
#   column.subset       : which columns (samples) to include
#   column.sel.type     : selection type ("samples" here)
#   thres / ceil        : threshold and ceiling applied before projection ("NULL" is passed as a string)
#   norm                : normalization before projection
# NOTE: the last entry must NOT be followed by a comma; an empty trailing argument
# makes list() fail with "argument 14 is empty".
test.datasets.table <-
   list(
      list( # Virtanen et al 2002 Integrated classification...
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/virtanen.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/virtanen.cls", 
         column.subset = c(42,43,44,46,47,48,49,53,54,55),  # 10 AD's
         column.sel.type = "samples",  
         thres = "NULL",
         ceil = "NULL",
         norm = 6), 
      list(  # Michigan: Beer et al 2002
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/michigan.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/michigan.2.cls",
         column.subset = c(seq(1, 10), seq(25, 34)),   # 20 AD: 10 NR and 10 R 
         column.sel.type = "samples",  
         thres = 20,
         ceil = 16000,
         norm = 6),
      list(  # Stanford: Garber et al 2001
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/stanford_no_nulls.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/stanford.3.cls",
         column.subset = c(2,8,9,13,18,28,31,34,35,36, 4,11,25,30,37,42,43,50,58,59), # 20 AD: 10 NR, 10 R 
         column.sel.type = "samples",  
         thres = "NULL",
         ceil = "NULL",
         norm = 6),
      list(  # dataset Jones MH, Virtanen et al
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/jones_no_nulls.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/jones.cls",
         column.subset = seq(1, 10), # 10 AD's
         column.sel.type = "samples",  
         thres = "NULL",
         ceil = "NULL",
         norm = 6),
      list(  # Virtanen et al 2002 Integrated classification...
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/virtanen.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/virtanen.cls", 
         column.subset = c(45,61,62,63,64,65,66,90),  # 8 SQ's 
         column.sel.type = "samples",  
         thres = "NULL",
         ceil = "NULL",
         norm = 6),
      list(  # Stanford: Garber et al 2001
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/stanford_no_nulls.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/stanford.3.cls",
         column.subset =    c(1,10,15,16,39,40,48,55,64,69), # 10 SQ's
         column.sel.type = "samples",  
         thres = "NULL",
         ceil = "NULL",
         norm = 6),
      list(  # Stanford: Garber et al 2001
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/stanford_no_nulls.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/stanford.3.cls",
         column.subset =    c(54,56,60,63,66), # 5 normals 
         column.sel.type = "samples",  
         thres = "NULL",
         ceil = "NULL",
         norm = 6),
      list(  # Meyerson AD cell lines
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/boston.plus.cell.lines.maxed.2.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/boston.plus.cell.lines.2.cls",                             
         column.subset = seq(1, 5), # 5 AD cell lines (CAD)
         column.sel.type = "samples",  
         thres = "NULL",
         ceil = "NULL",
         norm = 6),
      list(   # Virtanen et al 2002 Integrated classification...
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/virtanen.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/virtanen.cls", 
         column.subset = c(1,3,7,15,17,18,23,25,26,29,33), # 11 AD cell lines
         column.sel.type = "samples",  
         thres = "NULL",
         ceil = "NULL",
         norm = 6),
      list(  # NCI60 lung cell lines
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/NCI60.maxed.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/NCI60.maxed.cls",
         column.subset = seq(1, 9), # 9 Lung cell lines from NCI60
         column.sel.type = "samples",  
         thres = "NULL",
         ceil = "NULL",
         norm = 6),
      list(  # Akimura et al
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/GDS1688.maxed.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/GDS1688.maxed.cls",
         column.subset = seq(1, 10), # 10 Adeno cell lines
         column.sel.type = "samples",  
         thres = "NULL",
         ceil = "NULL",
         norm = 6),
      list(  # Virtanen et al 2002 Integrated classification...
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/virtanen.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/virtanen.cls",
         column.subset = c(5,19,20,21,22,31,35,37), # 8 SQ cell lines
         column.sel.type = "samples",  
         thres = "NULL",
         ceil = "NULL",
         norm = 6),
      list(  # SQ cell lines
         gct.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/GDS1688.maxed.gct", 
         cls.file = "c:/CGP2006/MPM/Software_and_Examples/Lung/GDS1688.maxed.cls",
         column.subset =  seq(21, 29), # 9 SQ cell lines
         column.sel.type = "samples",  
         thres = "NULL",
         ceil = "NULL",
         norm = 6)    # last entry: no trailing comma
     )

# Define parameters for this specific run (each one is described next to the
# corresponding argument of the MetaGene.Projection call further down).
# Logical values are spelled TRUE/FALSE; T and F are ordinary variables that can be reassigned.

identifier          <-    "Lung.1"     # Prefix used to name the output files
k.proj              <-           4     # Number of metagenes in the projection
alg                 <-    "NMF.div"    # Projection algorithm
niter               <-        2000     # Number of algorithm iterations
seed                <-        1789     # Random seed to initialize the metagene matrices
nchar.phen          <-           3     # Number of characters identifying classes in the CLS files
postprojnorm        <-        TRUE     # Apply post-projection normalization
use.biplot          <-        TRUE     # NOTE: defined here but not passed to the MetaGene.Projection call in this script
heatmap.row.norm    <-       FALSE     # Row-normalize (standardize) the rows in the heat map
heatmap.cmap.type   <-           6     # Heat map color map type
use.feature.names   <-       FALSE     # NOTE: defined here but not passed to the MetaGene.Projection call in this script
high.conf.thres     <-         0.3     # Confidence threshold separating calls from no-calls
output.dir          <-     "c:/CGP2006/MPM/Software_and_Examples/Lung/"   # Directory for the output files
kernel              <-     "radial"    # SVM kernel function
cost                <-           1     # SVM cost parameter
gamma               <-           5     # Gamma coefficient for the radial kernel
theta               <-        0.05     # NOTE: defined here but not passed to the MetaGene.Projection call in this script
model.set.refinement <-       TRUE     # Perform model set refinement

# These are the symbols and colors to use for each phenotype in the model and test sets.
# legend.list is a flat vector of (color, symbol) pairs, one pair per phenotype; because
# c() mixes strings and numbers, the symbol codes are stored as strings and converted
# back to numbers when 'symbs' is extracted below.
# NOTE: the last pair must NOT be followed by a comma; an empty trailing argument
# makes c() fail with "argument 33 is empty".
#          model samples:   square symbols
#                  color         symbol      phenotype
legend.list <- c("blue",          22,        # AD-BOS
                 "red",           22,        # SQ-BOS
                 "green",         22,        # NL-BOS
#          test samples:    circles and other symbols
#                  color         symbol      phenotype
                 "blue",          21,        # AD-VIR
                 "blue",          23,        # AD-MIC
                 "blue",          24,        # AD-STA
                 "blue",          25,        # AD-JON
                 "red",           21,        # SQ-VIR
                 "red",           24,        # SQ-STA
                 "green",         24,        # NL-STA
                 "steelblue",     22,        # CAD-BOS
                 "steelblue",     21,        # CAD-VIR
                 "steelblue",     25,        # CAD-NCI60
                 "steelblue",     24,        # CAD-AKI
                 "darkred",       21,        # CSQ-VIR
                 "darkred",       24         # CSQ-AKI (last pair: no trailing comma)
         )

symbol.scaling <- 0.55                                            # Graphical scaling for plot and legend symbols
col <- legend.list[seq(1, length(legend.list), 2)]                # odd positions: colors
symbs <- as.numeric(legend.list[seq(2, length(legend.list), 2)])  # even positions: plotting symbol codes

# Call to the Metagene Projection function. It runs the entire methodology,
# except for the GSEA analysis of metagenes and the model (k) selection.
# Every argument is passed by name from the variables defined earlier in this script.

MetaGene.Projection(
  # --- Input datasets ---
  # R lists with the model dataset parameters and the test dataset(s) parameters
  # (see model.dataset.table and test.datasets.table above).
  model.dataset.table = model.dataset.table,
  test.datasets.table = test.datasets.table,

  # --- Projection ---
  # identifier : prefix to name output files
  # k.proj     : number of metagenes in projection
  # alg        : algorithm for Metagene Projection (default NMF.div):
  #                "NMF.div" : Non-Negative Matrix Factorization using the divergence cost
  #                (other algorithms are internally supported but have not been tested)
  # niter      : number of algorithm iterations (default: 2000)
  # seed       : random seed to initialize metagene matrices (default: 1234)
  # nchar.phen : number of characters to use to identify classes from the CLS files
  # postprojnorm : TRUE or FALSE: apply post-projection normalization
  #                (i.e. scale points to unit hypersphere, default: T)
  identifier = identifier,
  k.proj = k.proj,
  alg = alg,
  niter = niter,
  seed = seed,
  nchar.phen = nchar.phen,
  postprojnorm = postprojnorm,

  # --- Heat map and output ---
  # heatmap.row.norm  : TRUE or FALSE: row-normalize (standardize) the heat map rows (default F)
  # heatmap.cmap.type : 1 = vintage pinkogram, 2 = scale of grays, 4 = high-resolution pinkogram,
  #                     6 = redish color map for metagene factors (default: 6)
  # high.conf.thres   : confidence threshold (Brier score) to separate calls from no-calls (default 0.3)
  # output.dir        : output directory where the resulting output files will be produced
  heatmap.row.norm = heatmap.row.norm,
  heatmap.cmap.type = heatmap.cmap.type,
  high.conf.thres = high.conf.thres,
  output.dir = output.dir,

  # --- Plot legend ---
  # col / symbs    : colors and plotting symbols for phenotypes: first model, then test dataset(s)
  # symbol.scaling : graphical scaling for symbols in plots and plot legends (default: 1)
  col = col,
  symbs = symbs,
  symbol.scaling = symbol.scaling,

  # --- SVM and refinement ---
  # kernel : kernel function for SVM: "radial" or "linear" (default: "radial")
  # cost   : cost parameter for SVM (default: 1)
  # gamma  : gamma coefficient for radial base function kernel (default: 0.05)
  # model.set.refinement : TRUE or FALSE: perform model set refinement (default: T)
  kernel = kernel,
  cost = cost,
  gamma = gamma,
  model.set.refinement = model.set.refinement
)

# end of metagene projection script example

