From 8b7a0b3b6272164a68538a664d1d0dc9cac46055 Mon Sep 17 00:00:00 2001
From: Guillermo del Angel <delangel@broadinstitute.org>
Date: Tue, 23 Aug 2011 12:40:01 -0400
Subject: [PATCH 04/25] Two new arguments to SelectVariants to exclude either
 multiallelic or biallelic sites from input vcf

---
 .../gatk/walkers/variantutils/SelectVariants.java    | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
index bfe7198cf..ae798cf58 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@@ -223,6 +223,12 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
     @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
     private boolean EXCLUDE_FILTERED = false;
 
+    @Argument(fullName="excludeBiallelic", shortName="xl_biallelic", doc="Select only multi-allelic variants, excluding any biallelic one.", required=false)
+    private boolean EXCLUDE_BIALLELIC = false;
+
+    @Argument(fullName="selectMultiallelic", shortName="xl_multiallelic", doc="Select only biallelic variants, excluding any multi-allelic one.", required=false)
+    private boolean EXCLUDE_MULTIALLELIC = false;
+
     @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false)
     private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
 
@@ -335,6 +341,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
         // first, add any requested samples
         samples.addAll(samplesFromFile);
         samples.addAll(samplesFromExpressions);
+        if (sampleNames != null)
         samples.addAll(sampleNames);
 
         // if none were requested, we want all of them
@@ -493,6 +500,11 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
                 if (!isConcordant(vc, compVCs))
                     return 0;
             }
+            if (EXCLUDE_BIALLELIC && vc.isBiallelic())
+                return 0;
+
+            if (EXCLUDE_MULTIALLELIC && !vc.isBiallelic())
+                return 0;
 
             // TODO - add ability to also select MNPs
             // TODO - move variant selection arguments to the engine so other walkers can also do this

From 6e2552a9efe411e650ea629f6a59c1142fd8b990 Mon Sep 17 00:00:00 2001
From: Guillermo del Angel <delangel@broadinstitute.org>
Date: Tue, 23 Aug 2011 12:40:43 -0400
Subject: [PATCH 05/25] Merge fix

---
 .../sting/gatk/walkers/variantutils/SelectVariants.java          | 1 -
 1 file changed, 1 deletion(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
index ae798cf58..428c896e9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@@ -341,7 +341,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
         // first, add any requested samples
         samples.addAll(samplesFromFile);
         samples.addAll(samplesFromExpressions);
-        if (sampleNames != null)
         samples.addAll(sampleNames);
 
         // if none were requested, we want all of them

From a1a1fac9e4f4fc8b8b4b07cf8603e26ac70b8c30 Mon Sep 17 00:00:00 2001
From: Ryan Poplin <rpoplin@broadinstitute.org>
Date: Tue, 23 Aug 2011 13:43:07 -0400
Subject: [PATCH 06/25] Likelihood engine now gives non-zero likelihoods. Using
 HMM function that can handle context specific gap open and gap continuation
 penalties

---
 .../gatk/walkers/genotyper/UnifiedArgumentCollection.java    | 2 +-
 .../sting/gatk/walkers/indels/PairHMMIndelErrorModel.java    | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
index e7f89bf08..ae419a5c4 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@@ -156,7 +156,7 @@ public class UnifiedArgumentCollection {
     public boolean OUTPUT_DEBUG_INDEL_INFO = false;
 
     @Hidden
-    @Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false)
+    @Argument(fullName = "dovit", shortName = "dovit", doc = "Perform full Viterbi calculation when evaluating the HMM", required = false)
     public boolean dovit = false;
 
     @Hidden
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
index 55450486b..2d7969230 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
@@ -274,7 +274,7 @@ public class PairHMMIndelErrorModel {
         this.doViterbi = dovit;
     }
 
-    public  PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) {
+    public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) {
 
 
         this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob
@@ -754,7 +754,7 @@ public class PairHMMIndelErrorModel {
 
             // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
             if (indelLikelihoodMap.containsKey(p)) {
-                HashMap<Allele,Double> el =  indelLikelihoodMap.get(p);
+                HashMap<Allele,Double> el = indelLikelihoodMap.get(p);
                 int j=0;
                 for (Allele a: haplotypeMap.keySet()) {
                     readLikelihoods[readIdx][j++] = el.get(a);
@@ -1055,7 +1055,6 @@ public class PairHMMIndelErrorModel {
             genotypeLikelihoods[i] -= maxElement;
 
         return genotypeLikelihoods;
-
     }
 
     /**

From 1ecbf05aae96096f2c075bf4c11095a620a2e9ec Mon Sep 17 00:00:00 2001
From: Khalid Shakir <kshakir@broadinstitute.org>
Date: Tue, 23 Aug 2011 23:49:36 -0400
Subject: [PATCH 08/25] Avoid segfaults due to out of date and possibly
 abandonded LSF DRMAA implementation when use'ing LSF instead of
 .combined_LSF_SGE

---
 .../drmaa/v1_0/LibDrmaaIntegrationTest.java   | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java
index ac2064640..d98281ad3 100644
--- a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java
@@ -40,6 +40,7 @@ import java.util.Arrays;
 import java.util.List;
 
 public class LibDrmaaIntegrationTest extends BaseTest {
+    private String implementation = null;
 
     @Test
     public void testDrmaa() throws Exception {
@@ -79,10 +80,24 @@ public class LibDrmaaIntegrationTest extends BaseTest {
             Assert.fail(String.format("Could not get DRMAA implementation from the DRMAA library: %s", error.getString(0)));
 
         System.out.println(String.format("DRMAA implementation(s): %s", drmaaImplementation.getString(0)));
+
+        this.implementation = drmaaImplementation.getString(0);
     }
 
-    @Test
+    @Test(dependsOnMethods = { "testDrmaa" })
     public void testSubmitEcho() throws Exception {
+        if (implementation.indexOf("LSF") >= 0) {
+            System.err.println("    *********************************************************");
+            System.err.println("   ***********************************************************");
+            System.err.println("   ****                                                   ****");
+            System.err.println("  ****  Skipping LibDrmaaIntegrationTest.testSubmitEcho()  ****");
+            System.err.println("  ****     Are you using the dotkit .combined_LSF_SGE?     ****");
+            System.err.println("   ****                                                   ****");
+            System.err.println("   ***********************************************************");
+            System.err.println("    *********************************************************");
+            return;
+        }
+
         Memory error = new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER);
         int errnum;
 
@@ -129,7 +144,7 @@ public class LibDrmaaIntegrationTest extends BaseTest {
             errnum = LibDrmaa.drmaa_set_vector_attribute(jt, LibDrmaa.DRMAA_V_ARGV, args, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN);
 
             if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS)
-                Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_REMOTE_COMMAND, error.getString(0)));
+                Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_V_ARGV, error.getString(0)));
 
             errnum = LibDrmaa.drmaa_run_job(jobIdMem, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN, jt, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN);
 

From 219252a5661d7f00bff5237c040d55d813f9c24c Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro <carneiro@broadinstitute.org>
Date: Tue, 23 Aug 2011 15:38:19 -0400
Subject: [PATCH 09/25] Adapting to the new RodBinding framework

---
 .../sting/queue/qscripts/PacbioProcessingPipeline.scala         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala
index 9f3dd9a2c..accc5c7f0 100755
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala
@@ -145,7 +145,7 @@ class RecalibrateBaseQualities extends QScript {
   }
 
   case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs {
-    this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
+    this.knownSites :+= dbSNP
     this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
     this.input_file :+= inBam
     this.recal_file = outRecalFile

From cd12f7f2865bd3237e3fe952f664859956aa61d1 Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro <carneiro@broadinstitute.org>
Date: Wed, 24 Aug 2011 11:10:40 -0400
Subject: [PATCH 10/25] Fixed list dependency

Instead of creating a bam list file, I dynamically create a scala list and pass as parameters. This way the intermediate bam files don't get deleted before they should.
---
 .../qscripts/DataProcessingPipeline.scala     | 76 +++++++++----------
 1 file changed, 36 insertions(+), 40 deletions(-)

diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala
index 724518142..2417e5620 100755
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala
@@ -106,7 +106,7 @@ class DataProcessingPipeline extends QScript {
   // Because the realignment only happens after these scripts are executed, in case you are using
   // bwa realignment, this function will operate over the original bam files and output over the
   // (to be realigned) bam files.
-  def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, File] = {
+  def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, List[File]] = {
 
     // Creating a table with SAMPLE information from each input BAM file
     val sampleTable = scala.collection.mutable.Map.empty[String, List[File]]
@@ -131,24 +131,25 @@ class DataProcessingPipeline extends QScript {
           sampleTable(sample) :+= rBam
       }
     }
+    return sampleTable.toMap
 
-    println("\n\n*** INPUT FILES ***\n")
-    // Creating one file for each sample in the dataset
-    val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
-    for ((sample, flist) <- sampleTable) {
-
-      println(sample + ":")
-      for (f <- flist)
-        println (f)
-      println()
-
-      val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list")
-      sampleBamFiles(sample) = sampleFileName
-      add(writeList(flist, sampleFileName))
-    }
-    println("*** INPUT FILES ***\n\n")
-
-    return sampleBamFiles.toMap
+//    println("\n\n*** INPUT FILES ***\n")
+//    // Creating one file for each sample in the dataset
+//    val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
+//    for ((sample, flist) <- sampleTable) {
+//
+//      println(sample + ":")
+//      for (f <- flist)
+//        println (f)
+//      println()
+//
+//      val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list")
+//      sampleBamFiles(sample) = sampleFileName
+//      //add(writeList(flist, sampleFileName))
+//    }
+//    println("*** INPUT FILES ***\n\n")
+//
+//    return sampleBamFiles.toMap
   }
 
   // Rebuilds the Read Group string to give BWA
@@ -224,7 +225,10 @@ class DataProcessingPipeline extends QScript {
 
 
   def script = {
+    // final output list of processed bam files
+    var cohortList: List[File] = List()
 
+    // sets the model for the Indel Realigner
     cleanModelEnum = getIndelCleaningModel()
 
     // keep a record of the number of contigs in the first bam file in the list
@@ -233,28 +237,19 @@ class DataProcessingPipeline extends QScript {
 
     val realignedBAMs = if (useBWApe || useBWAse) {performAlignment(bams)} else {revertBams(bams)}
 
-    // Generate a BAM file per sample joining all per lane files if necessary
-    val sampleBAMFiles: Map[String, File] = createSampleFiles(bams, realignedBAMs)
+    // generate a BAM file per sample joining all per lane files if necessary
+    val sampleBAMFiles: Map[String, List[File]] = createSampleFiles(bams, realignedBAMs)
 
-    // Final output list of processed bam files
-    var cohortList: List[File] = List()
-
-    // Simple progress report
-    println("\nFound the following samples: ")
-    for ((sample, file) <- sampleBAMFiles)
-      println("\t" + sample + " -> " + file)
-    println("\n")
-
-    // If this is a 'knowns only' indel realignment run, do it only once for all samples.
+    // if this is a 'knowns only' indel realignment run, do it only once for all samples.
     val globalIntervals = new File(outputDir + projectName + ".intervals")
     if (cleaningModel == ConsensusDeterminationModel.KNOWNS_ONLY)
       add(target(null, globalIntervals))
 
-    // Put each sample through the pipeline
-    for ((sample, sampleFile) <- sampleBAMFiles) {
-      val bam = if (sampleFile.endsWith(".list")) {swapExt(sampleFile, ".list", ".bam")} else {sampleFile}
+    // put each sample through the pipeline
+    for ((sample, bamList) <- sampleBAMFiles) {
 
       // BAM files generated by the pipeline
+      val bam        = new File(qscript.projectName + "." + sample + ".bam")
       val cleanedBam = swapExt(bam, ".bam", ".clean.bam")
       val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam")
       val recalBam   = swapExt(bam, ".bam", ".clean.dedup.recal.bam")
@@ -272,15 +267,16 @@ class DataProcessingPipeline extends QScript {
 
       // Validation is an optional step for the BAM file generated after
       // alignment and the final bam file of the pipeline.
-      if (!noValidation && sampleFile.endsWith(".bam")) {        // todo -- implement validation for .list BAM files
+      if (!noValidation) {
+        for (sampleFile <- bamList)
         add(validate(sampleFile, preValidateLog),
             validate(recalBam, postValidateLog))
       }
 
       if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY)
-        add(target(sampleFile, targetIntervals))
+        add(target(bamList, targetIntervals))
 
-      add(clean(sampleFile, targetIntervals, cleanedBam),
+      add(clean(bamList, targetIntervals, cleanedBam),
           dedup(cleanedBam, dedupedBam, metricsFile),
           cov(dedupedBam, preRecalFile),
           recal(dedupedBam, preRecalFile, recalBam),
@@ -320,9 +316,9 @@ class DataProcessingPipeline extends QScript {
       this.maxRecordsInRam = 100000
   }
 
-  case class target (inBams: File, outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
+  case class target (inBams: List[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
     if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY)
-      this.input_file :+= inBams
+      this.input_file = inBams
     this.out = outIntervals
     this.mismatchFraction = 0.0
     this.known :+= qscript.dbSNP
@@ -333,8 +329,8 @@ class DataProcessingPipeline extends QScript {
     this.jobName = queueLogDir + outIntervals + ".target"
   }
 
-  case class clean (inBams: File, tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
-    this.input_file :+= inBams
+  case class clean (inBams: List[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
+    this.input_file = inBams
     this.targetIntervals = tIntervals
     this.out = outBam
     this.known :+= qscript.dbSNP

From e618cb1e7911e5cd12c43c5aaafc03a954f605a9 Mon Sep 17 00:00:00 2001
From: Guillermo del Angel <delangel@broadinstitute.org>
Date: Wed, 24 Aug 2011 12:25:50 -0400
Subject: [PATCH 11/25] a) Renamed/expanded SelectVariants arguments that
 choose particular kinds of variants and particular allelic types, now instead
 of -Indels or -SNPs we can specify for example -selectType
 [MIXED|INDEL|SNP|MNP|SYMBOLIC]. To select biallelic, multiallelic variants,
 use -restrictAllelesTo [BIALLELIC|MULTIALLELIC]. Corresponding gatkdocs
 changes. b) More useful AC,AF logging in VariantsToTable with multiallelic
 sites: instead of logging comma-separated values, log max value by default.
 Hidden, experimental argument -logACSum to log sum of ACs instead. This is
 due to extreme slowness of R in parsing strings to tokens and computing
 max/sum itself (~100x slower than gatk). c) Added integrationtest for new
 SelectVariants commands

---
 .../walkers/variantutils/SelectVariants.java  | 78 +++++++++++++++----
 .../walkers/variantutils/VariantsToTable.java | 48 +++++++++---
 .../SelectVariantsIntegrationTest.java        | 13 ++++
 3 files changed, 112 insertions(+), 27 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
index 428c896e9..bb3cd82a1 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@@ -24,6 +24,7 @@
 
 package org.broadinstitute.sting.gatk.walkers.variantutils;
 
+import org.apache.poi.hpsf.Variant;
 import org.broadinstitute.sting.commandline.*;
 import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
 import org.broadinstitute.sting.utils.MathUtils;
@@ -165,6 +166,23 @@ import java.util.*;
  *   -o output.vcf \
  *   -fraction 0.5
  *
+ * Select only indels from a VCF:
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectVariants \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -selectType INDEL
+ *
+ * Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column):
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectVariants \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -selectType SNP -selectType MNP \
+ *   -restrictAllelesTo MULTIALLELIC
+ *
  * </pre>
  *
  */
@@ -223,11 +241,17 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
     @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
     private boolean EXCLUDE_FILTERED = false;
 
-    @Argument(fullName="excludeBiallelic", shortName="xl_biallelic", doc="Select only multi-allelic variants, excluding any biallelic one.", required=false)
-    private boolean EXCLUDE_BIALLELIC = false;
 
-    @Argument(fullName="selectMultiallelic", shortName="xl_multiallelic", doc="Select only biallelic variants, excluding any multi-allelic one.", required=false)
-    private boolean EXCLUDE_MULTIALLELIC = false;
+    /**
+     * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
+     * For example, a multiallelic record such as:
+     * 1    100 .   A   AAA,AAAAA
+     * will be excluded if "-restrictAllelesTo BIALLELIC" is included, because there are two alternate alleles, whereas a record such as:
+     * 1    100 .   A  T
+     * will be included in that case, but would be excluded if "-restrictAllelesTo MULTIALLELIC
+     */
+    @Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false)
+    private  NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL;
 
     @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false)
     private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
@@ -272,11 +296,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
     @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
     private double fractionRandom = 0;
 
-    @Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs from the input file", required=false)
-    private boolean SELECT_SNPS = false;
+    /**
+      * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
+     * When specified one or more times, a particular type of variant is selected.
+     *
+      */
+    @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false)
+    private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<VariantContext.Type>();
 
-    @Argument(fullName="selectIndels", shortName="indels", doc="Select only indels from the input file", required=false)
-    private boolean SELECT_INDELS = false;
 
     @Hidden
     @Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
@@ -296,6 +323,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
 
     }
 
+    public enum NumberAlleleRestriction {
+           ALL,
+           BIALLELIC,
+           MULTIALLELIC
+    }
+
+    private ArrayList<VariantContext.Type> selectedTypes = new ArrayList<VariantContext.Type>();
     private ArrayList<String> selectNames = new ArrayList<String>();
     private List<VariantContextUtils.JexlVCMatchExp> jexls = null;
 
@@ -360,6 +394,20 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
         for ( String sample : samples )
             logger.info("Including sample '" + sample + "'");
 
+
+
+        // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
+        if (TYPES_TO_INCLUDE.isEmpty()) {
+
+            for (VariantContext.Type t : VariantContext.Type.values())
+                selectedTypes.add(t);
+
+        }
+        else {
+            for (VariantContext.Type t : TYPES_TO_INCLUDE)
+                selectedTypes.add(t);
+
+        }
         // Initialize VCF header
         Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
         headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
@@ -499,18 +547,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
                 if (!isConcordant(vc, compVCs))
                     return 0;
             }
-            if (EXCLUDE_BIALLELIC && vc.isBiallelic())
-                return 0;
 
-            if (EXCLUDE_MULTIALLELIC && !vc.isBiallelic())
-                return 0;
-
-            // TODO - add ability to also select MNPs
-            // TODO - move variant selection arguments to the engine so other walkers can also do this
-            if (SELECT_INDELS && !(vc.isIndel() || vc.isMixed()))
+            if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic())
                 continue;
 
-            if (SELECT_SNPS && !vc.isSNP())
+            if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic())
+                continue;
+
+            if (!selectedTypes.contains(vc.getType()))
                 continue;
 
             VariantContext sub = subsetRecord(vc, samples);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
index 19db58e0c..7d0b4c3d4 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
@@ -124,8 +124,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
      * multi-allelic INFO field values can be lists of values.
      */
     @Advanced
-    @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
-    public boolean keepMultiAllelic = false;
+     @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
+     public boolean keepMultiAllelic = false;
+
+    @Hidden
+    @Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false)
+     public boolean logACSum = false;
 
     /**
      * By default, this tool throws a UserException when it encounters a field without a value in some record.  This
@@ -150,7 +154,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
         if ( ++nRecords < MAX_RECORDS || MAX_RECORDS == -1 ) {
             for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) {
                 if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) {
-                    List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA);
+                    List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, keepMultiAllelic, logACSum);
                     out.println(Utils.join("\t", vals));
                 }
             }
@@ -176,9 +180,11 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
      * @param fields a non-null list of fields to capture from VC
      * @param allowMissingData if false, then throws a UserException if any field isn't found in vc.  Otherwise
      *   provides a value of NA
+     *   @param kma if true, multiallelic variants are to be kept
+     *   @param logsum if true, AF and AC are computed based on sum of allele counts. Otherwise, based on allele with highest count.
      * @return
      */
-    public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
+    private static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData, boolean kma, boolean logsum) {
         List<String> vals = new ArrayList<String>();
 
         for ( String field : fields ) {
@@ -206,12 +212,31 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
             }
 
             if (field.equals("AF") || field.equals("AC")) {
-                      if (val.contains(",")) {
-                         // strip [,] and spaces
-                         val = val.replace("[","");
-                         val = val.replace("]","");
-                         val = val.replace(" ","");
-                        }
+                     String afo = val;
+
+                     double af=0;
+                     if (afo.contains(",")) {
+                         String[] afs = afo.split(",");
+                         afs[0] = afs[0].substring(1,afs[0].length());
+                         afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
+
+                         double[] afd = new double[afs.length];
+
+                         for (int k=0; k < afd.length; k++)
+                             afd[k] = Double.valueOf(afs[k]);
+
+                         if (kma && logsum)
+                             af = MathUtils.sum(afd);
+                         else
+                         af = MathUtils.arrayMax(afd);
+                         //af = Double.valueOf(afs[0]);
+
+                     }
+                     else
+                         if (!afo.equals("NA"))
+                             af = Double.valueOf(afo);
+
+                val = Double.toString(af);
 
             }
             vals.add(val);
@@ -220,6 +245,9 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
         return vals;
     }
 
+    public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
+        return extractFields(vc, fields, allowMissingData, false, false);
+    }
     //
     // default reduce -- doesn't do anything at all
     //
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
index cec377f5f..20409d4ca 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
@@ -77,6 +77,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
         executeTest("testConcordance--" + testFile, spec);
     }
 
+    @Test
+    public void testVariantTypeSelection() {
+        String testFile = validationDataLocation + "complexExample1.vcf";
+
+        WalkerTestSpec spec = new WalkerTestSpec(
+                "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s -NO_HEADER",
+                1,
+                Arrays.asList("e0b12c0b47a8a7a988b3587b47bfa8cf")
+        );
+
+        executeTest("testVariantTypeSelection--" + testFile, spec);
+    }
+
     @Test(enabled=false)
     public void testRemovePLs() {
         String testFile = validationDataLocation + "combine.3.vcf";

From e5008aba00a8c0d89a4622b2f66920b0f5484256 Mon Sep 17 00:00:00 2001
From: Ryan Poplin <rpoplin@broadinstitute.org>
Date: Wed, 24 Aug 2011 15:18:44 -0400
Subject: [PATCH 13/25] Output the top two haplotypes as a variant call by
 running smith-waterman alignment against the reference and calling any
 difference as variation. This is the first verion that runs end-to-end by
 taking in reads as bam file and writing out variant calls in VCF.

---
 .../sting/utils/variantcontext/VariantContext.java             | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
index 888dc1e98..1fde8879b 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
@@ -209,6 +209,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
 
     /**
      * the complete constructor.  Makes a complete VariantContext from its arguments
+     * This is the only constructor that is able to create indels! DO NOT USE THE OTHER ONES.
      *
      * @param source          source
      * @param contig          the contig
@@ -293,7 +294,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
     }
 
     /**
-     * Create a new variant context without genotypes and no Perror, no filters, and no attributes
+     * Create a new variant context with genotypes but without Perror, filters, and attributes
      *
      * @param source          source
      * @param contig          the contig

From dc8398e165e6dcf138916249d4ce365375008248 Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro <carneiro@broadinstitute.org>
Date: Wed, 24 Aug 2011 15:58:34 -0400
Subject: [PATCH 15/25] fixing bai output for indel cleaning.

---
 .../sting/queue/qscripts/DataProcessingPipeline.scala            | 1 -
 1 file changed, 1 deletion(-)

diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala
index 47ba0220f..116d16f35 100755
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala
@@ -313,7 +313,6 @@ class DataProcessingPipeline extends QScript {
   }
 
   case class clean (inBams: File, tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
-    @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
     this.input_file :+= inBams
     this.targetIntervals = tIntervals
     this.out = outBam

From e3f5d7067a6da44660132459d97d351f3c9c2185 Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro <carneiro@broadinstitute.org>
Date: Wed, 24 Aug 2011 14:07:54 -0400
Subject: [PATCH 16/25] Added ReorderSam queue binding

---
 .../queue/extensions/picard/ReorderSam.scala  | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala

diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala
new file mode 100644
index 000000000..14b42ed5d
--- /dev/null
+++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala
@@ -0,0 +1,48 @@
+package org.broadinstitute.sting.queue.extensions.picard
+
+import org.broadinstitute.sting.commandline._
+
+import java.io.File
+/*
+ * Created by IntelliJ IDEA.
+ * User: carneiro
+ * Date: 6/22/11
+ * Time: 10:35 AM
+ */
+class ReorderSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction {
+  analysisName = "ReorderSam"
+  javaMainClass = "net.sf.picard.sam.SortSam"
+
+  @Input(doc="Input file (bam or sam) to extract reads from.", shortName = "input", fullName = "input_bam_files", required = true)
+  var input: List[File] = Nil
+
+  @Output(doc="Output file (bam or sam) to write extracted reads to.", shortName = "output", fullName = "output_bam_file", required = true)
+  var output: File = _
+
+  @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false)
+  var outputIndex: File = _
+
+  @Argument(doc="Reference sequence to reorder reads to match.", shortName = "input", fullName = "input_bam_files", required = true)
+  var reference: File = _
+
+  @Argument(doc="If true, then allows only a partial overlap of the BAM contigs with the new reference sequence contigs. By default, this tool requires a corresponding contig in the new reference for each read contig.", shortName = "aic", fullName = "allow_incomplete_concordance", required = false)
+  var ALLOW_INCOMPLETE_DICT_CONCORDANCE: Boolean = false
+
+  @Argument(doc="If true, then permits mapping from a read contig to a new reference contig with the same name but a different length. Highly dangerous, only use if you know what you are doing.", shortName = "acld", fullName = "allow_contig_length_discordance", required = false)
+  var ALLOW_CONTIG_LENGTH_DISCORDANCE: Boolean = false
+
+  override def freezeFieldValues() {
+    super.freezeFieldValues()
+    if (outputIndex == null && output != null && output.toString.endsWith(".bam"))
+      outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai")
+  }
+
+  override def inputBams = input
+  override def outputBam = output
+  this.createIndex = Some(outputIndex != null)
+  this.sortOrder = null
+  override def commandLine = super.commandLine +
+    " REFERENCE=" + reference +
+    conditionalParameter (ALLOW_INCOMPLETE_DICT_CONCORDANCE, " ALLOW_INCOMPLETE_DICT_CONCORDANCE=true")
+    conditionalParameter (ALLOW_CONTIG_LENGTH_DISCORDANCE, " ALLOW_CONTIG_LENGTH_DISCORDANCE=true")
+}
\ No newline at end of file

From 16caca0822dfb08271f2ed30a31a8ff4c73da60f Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro <carneiro@broadinstitute.org>
Date: Wed, 24 Aug 2011 17:02:44 -0400
Subject: [PATCH 17/25] BLASR BAMs and new BWA parameters

*Added the functions to turn a BLASR generated BAM file into a usable BAM file.
*Modified the bwa parameters according to test results from NA12878 pb2k dataset.
---
 .../qscripts/PacbioProcessingPipeline.scala   | 51 ++++++++++++-------
 .../queue/extensions/picard/ReorderSam.scala  | 20 ++++----
 2 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala
index accc5c7f0..6947d4398 100755
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala
@@ -4,9 +4,9 @@ import org.broadinstitute.sting.queue.QScript
 import org.broadinstitute.sting.queue.extensions.gatk._
 import org.broadinstitute.sting.queue.util.QScriptUtils
 import net.sf.samtools.SAMFileHeader.SortOrder
-import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups}
 import org.broadinstitute.sting.utils.exceptions.UserException
 import org.broadinstitute.sting.commandline.Hidden
+import org.broadinstitute.sting.queue.extensions.picard.{ReorderSam, SortSam, AddOrReplaceReadGroups}
 
 /**
  * Created by IntelliJ IDEA.
@@ -16,7 +16,7 @@ import org.broadinstitute.sting.commandline.Hidden
  */
 
 
-class RecalibrateBaseQualities extends QScript {
+class PacbioProcessingPipeline extends QScript {
 
   @Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true)
   var input: File = _
@@ -39,13 +39,16 @@ class RecalibrateBaseQualities extends QScript {
   @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false)
   var bwaPath: File = _
 
+  @Input(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false)
+  var BLASR_BAM: Boolean = false
+
   @Hidden
   @Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
   var dbq: Int = 20
 
-
-
-
+  @Hidden
+  @Input(shortName="bwastring", required=false)
+  var bwastring: String = ""
 
   val queueLogDir: String = ".qlog/"
 
@@ -57,8 +60,6 @@ class RecalibrateBaseQualities extends QScript {
 
       var USE_BWA: Boolean = false
 
-      println("DEBUG: processing " + file + "\nDEBUG: name -- " + file.getName)
-
       if (file.endsWith(".fasta") || file.endsWith(".fq")) {
         if (bwaPath == null) {
           throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA");
@@ -69,28 +70,34 @@ class RecalibrateBaseQualities extends QScript {
       // FASTA -> BAM steps
       val alignedSam: File = file.getName + ".aligned.sam"
       val sortedBam: File = swapExt(alignedSam, ".sam", ".bam")
-      val qualBam: File = swapExt(sortedBam, ".bam", ".q.bam")
       val rgBam: File = swapExt(file, ".bam", ".rg.bam")
 
       val bamBase = if (USE_BWA) {rgBam} else {file}
 
       // BAM Steps
+      val mqBAM: File = swapExt(bamBase, ".bam", ".mq.bam")
       val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.csv")
       val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.csv")
       val recalBam: File   = swapExt(bamBase, ".bam", ".recal.bam")
       val path1: String    = recalBam + ".before"
       val path2: String    = recalBam + ".after"
 
-
       if (USE_BWA) {
         add(align(file, alignedSam),
             sortSam(alignedSam, sortedBam),
-            addQuals(sortedBam, qualBam, dbq),
-            addReadGroup(qualBam, rgBam, sample))
+            addReadGroup(sortedBam, rgBam, sample))
       }
 
-      add(cov(bamBase, recalFile1),
-          recal(bamBase, recalFile1, recalBam),
+      else if (BLASR_BAM) {
+        val reorderedBAM = swapExt(bamBase, ".bam", ".reordered.bam")
+        add(reorder(bamBase, reorderedBAM),
+            changeMQ(reorderedBAM, mqBAM))
+      }
+
+      val bam = if (BLASR_BAM) {mqBAM} else {bamBase}
+
+      add(cov(bam, recalFile1),
+          recal(bam, recalFile1, recalBam),
           cov(recalBam, recalFile2),
           analyzeCovariates(recalFile1, path1),
           analyzeCovariates(recalFile2, path2))
@@ -110,13 +117,13 @@ class RecalibrateBaseQualities extends QScript {
 
 
   case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs {
-    def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z10 -t8 " + reference + " " + inFastq + " > " + outSam
+    def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z20 -t16 " + reference + " " + inFastq + " > " + outSam
+    this.memoryLimit = 8
     this.analysisName = queueLogDir + outSam + ".bwa_sam_se"
     this.jobName = queueLogDir + outSam + ".bwa_sam_se"
   }
 
-  case class sortSam (@Input inSam: File, @Output outBam: File) extends SortSam with ExternalCommonArgs {
-    @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
+  case class sortSam (inSam: File, outBam: File) extends SortSam with ExternalCommonArgs {
     this.input = List(inSam)
     this.output = outBam
     this.sortOrder = SortOrder.coordinate
@@ -124,10 +131,16 @@ class RecalibrateBaseQualities extends QScript {
     this.jobName = queueLogDir + outBam + ".sortSam"
   }
 
-  case class addQuals(inBam: File, outBam: File, qual: Int) extends PrintReads with CommandLineGATKArgs {
+  case class reorder (inSam: File, outSam: File) extends ReorderSam with ExternalCommonArgs {
+    this.input = List(inSam)
+    this.output = outSam
+    this.sortReference = reference
+  }
+
+  case class changeMQ(inBam: File, outBam: File) extends PrintReads with CommandLineGATKArgs {
     this.input_file :+= inBam
     this.out = outBam
-    this.DBQ = qual
+    this.read_filter :+= "ReassignMappingQuality"
   }
 
   case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs {
@@ -145,6 +158,7 @@ class RecalibrateBaseQualities extends QScript {
   }
 
   case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs {
+    this.DBQ = dbq
     this.knownSites :+= dbSNP
     this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
     this.input_file :+= inBam
@@ -155,6 +169,7 @@ class RecalibrateBaseQualities extends QScript {
   }
 
   case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs {
+    this.DBQ = dbq
     this.input_file :+= inBam
     this.recal_file = inRecalFile
     this.out = outBam
diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala
index 14b42ed5d..72489dc87 100644
--- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala
@@ -11,7 +11,7 @@ import java.io.File
  */
 class ReorderSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction {
   analysisName = "ReorderSam"
-  javaMainClass = "net.sf.picard.sam.SortSam"
+  javaMainClass = "net.sf.picard.sam.ReorderSam"
 
   @Input(doc="Input file (bam or sam) to extract reads from.", shortName = "input", fullName = "input_bam_files", required = true)
   var input: List[File] = Nil
@@ -22,27 +22,27 @@ class ReorderSam extends org.broadinstitute.sting.queue.function.JavaCommandLine
   @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false)
   var outputIndex: File = _
 
-  @Argument(doc="Reference sequence to reorder reads to match.", shortName = "input", fullName = "input_bam_files", required = true)
-  var reference: File = _
+  @Argument(doc="Reference sequence to reorder reads to match.", shortName = "ref", fullName = "sort_reference", required = true)
+  var sortReference: File = _
 
   @Argument(doc="If true, then allows only a partial overlap of the BAM contigs with the new reference sequence contigs. By default, this tool requires a corresponding contig in the new reference for each read contig.", shortName = "aic", fullName = "allow_incomplete_concordance", required = false)
-  var ALLOW_INCOMPLETE_DICT_CONCORDANCE: Boolean = false
+  var ALLOW_INCOMPLETE_DICT_CONCORDANCE: Boolean = _
 
   @Argument(doc="If true, then permits mapping from a read contig to a new reference contig with the same name but a different length. Highly dangerous, only use if you know what you are doing.", shortName = "acld", fullName = "allow_contig_length_discordance", required = false)
-  var ALLOW_CONTIG_LENGTH_DISCORDANCE: Boolean = false
+  var ALLOW_CONTIG_LENGTH_DISCORDANCE: Boolean = _
 
   override def freezeFieldValues() {
     super.freezeFieldValues()
-    if (outputIndex == null && output != null && output.toString.endsWith(".bam"))
+    if (outputIndex == null && output != null)
       outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai")
   }
 
   override def inputBams = input
   override def outputBam = output
-  this.createIndex = Some(outputIndex != null)
+  this.createIndex = Some(true)
   this.sortOrder = null
   override def commandLine = super.commandLine +
-    " REFERENCE=" + reference +
-    conditionalParameter (ALLOW_INCOMPLETE_DICT_CONCORDANCE, " ALLOW_INCOMPLETE_DICT_CONCORDANCE=true")
-    conditionalParameter (ALLOW_CONTIG_LENGTH_DISCORDANCE, " ALLOW_CONTIG_LENGTH_DISCORDANCE=true")
+    " REFERENCE=" + sortReference +
+    optional(" ALLOW_INCOMPLETE_DICT_CONCORDANCE=", ALLOW_INCOMPLETE_DICT_CONCORDANCE)
+    optional(" ALLOW_CONTIG_LENGTH_DISCORDANCE=", ALLOW_CONTIG_LENGTH_DISCORDANCE)
 }
\ No newline at end of file

From 8bbef79fc275be0f250b2eaf8fdf6d2892b985a1 Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Thu, 25 Aug 2011 15:37:26 -0400
Subject: [PATCH 22/25] Create clipped alleles during allele parsing instead of
 creating a full VC, clipping alleles, and regenerating the VC from scratch.

---
 .../utils/codecs/vcf/AbstractVCFCodec.java    | 143 +++++-------------
 .../utils/variantcontext/VariantContext.java  |   5 +-
 .../variantcontext/VariantContextUtils.java   |  74 ++++++++-
 3 files changed, 111 insertions(+), 111 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
index a3100030e..3a4ca2478 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
@@ -281,7 +281,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
 
         VariantContext vc = null;
         try {
-            vc =  new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes);
+            vc =  new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes, ref.getBytes()[0]);
         } catch (Exception e) {
             generateException(e.getMessage());
         }
@@ -291,7 +291,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
             vc.getGenotypes();
 
         // Trim bases of all alleles if necessary
-        return createVariantContextWithTrimmedAlleles(vc);
+        return vc;
     }
 
     /**
@@ -516,25 +516,44 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
         return true;
     }
 
-    private static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
+    public static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
         boolean clipping = true;
-        // Note that the computation of forward clipping here is meant only to see whether there is a common
-        // base to all alleles, and to correctly compute reverse clipping,
-        // but it is not used for actually changing alleles - this is done in function
-        // createVariantContextWithTrimmedAlleles() below.
 
-        for (Allele a : unclippedAlleles) {
-            if (a.isSymbolic()) {
+        for ( Allele a : unclippedAlleles ) {
+            if ( a.isSymbolic() )
                 continue;
-            }
-            if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) {
+
+            if ( a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0]) ) {
                 clipping = false;
+                break;
             }
         }
-        return (clipping) ? 1 : 0;
 
+        return (clipping) ? 1 : 0;
     }
 
+    protected static int computeReverseClipping(List<Allele> unclippedAlleles, String ref, int forwardClipping, int lineNo) {
+        int clipping = 0;
+        boolean stillClipping = true;
+
+        while ( stillClipping ) {
+            for ( Allele a : unclippedAlleles ) {
+                if ( a.isSymbolic() )
+                    continue;
+
+                if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 )
+                    stillClipping = false;
+                else if ( ref.length() == clipping )
+                    generateException("bad alleles encountered", lineNo);
+                else if ( a.getBases()[a.length()-clipping-1] != ref.getBytes()[ref.length()-clipping-1] )
+                    stillClipping = false;
+            }
+            if ( stillClipping )
+                clipping++;
+        }
+
+        return clipping;
+    }
     /**
      * clip the alleles, based on the reference
      *
@@ -542,122 +561,30 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
      * @param ref the reference string
      * @param unclippedAlleles the list of unclipped alleles
      * @param clippedAlleles output list of clipped alleles
+     * @param lineNo the current line number in the file
      * @return the new reference end position of this event
      */
     protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
 
-        // Note that the computation of forward clipping here is meant only to see whether there is a common
-        // base to all alleles, and to correctly compute reverse clipping,
-        // but it is not used for actually changing alleles - this is done in function
-        // createVariantContextWithTrimmedAlleles() below.
-
         int forwardClipping = computeForwardClipping(unclippedAlleles, ref);
-
-        int reverseClipped = 0;
-        boolean clipping = true;
-        while (clipping) {
-            for (Allele a : unclippedAlleles) {
-                if (a.isSymbolic()) {
-                    continue;
-                }
-                if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0)
-                    clipping = false;
-                else if (ref.length() == reverseClipped)
-                    generateException("bad alleles encountered", lineNo);
-                else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
-                    clipping = false;
-            }
-            if (clipping) reverseClipped++;
-        }
+        int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo);
 
         if ( clippedAlleles != null ) {
             for ( Allele a : unclippedAlleles ) {
                 if ( a.isSymbolic() ) {
                     clippedAlleles.add(a);
                 } else {
-                    clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(),0,a.getBases().length-reverseClipped),a.isReference()));
+                    clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length-reverseClipping), a.isReference()));
                 }
             }
         }
 
         // the new reference length
-        int refLength = ref.length() - reverseClipped;
+        int refLength = ref.length() - reverseClipping;
 
         return position+Math.max(refLength - 1,0);
     }
 
-    public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
-        // see if we need to trim common reference base from all alleles
-        boolean trimVC;
-
-        // We need to trim common reference base from all alleles in all genotypes if a ref base is common to all alleles
-        Allele refAllele = inputVC.getReference();
-        if (!inputVC.isVariant())
-            trimVC = false;
-        else if (refAllele.isNull())
-            trimVC = false;
-        else {
-            trimVC = (computeForwardClipping(new ArrayList<Allele>(inputVC.getAlternateAlleles()),
-                    inputVC.getReference().getDisplayString()) > 0);
-         }
-
-        // nothing to do if we don't need to trim bases
-        if (trimVC) {
-            List<Allele> alleles = new ArrayList<Allele>();
-            Map<String, Genotype> genotypes = new TreeMap<String, Genotype>();
-
-            // set the reference base for indels in the attributes
-            Map<String,Object> attributes = new TreeMap<String,Object>(inputVC.getAttributes());
-
-            Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
-
-            for (Allele a : inputVC.getAlleles()) {
-                if (a.isSymbolic()) {
-                    alleles.add(a);
-                    originalToTrimmedAlleleMap.put(a, a);
-                } else {
-                    // get bases for current allele and create a new one with trimmed bases
-                    byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length());
-                    Allele trimmedAllele = Allele.create(newBases, a.isReference());
-                    alleles.add(trimmedAllele);
-                    originalToTrimmedAlleleMap.put(a, trimmedAllele);
-                }
-            }
-
-            // detect case where we're trimming bases but resulting vc doesn't have any null allele. In that case, we keep original representation
-            // example: mixed records such as {TA*,TGA,TG}
-            boolean hasNullAlleles = false;
-
-            for (Allele a: originalToTrimmedAlleleMap.values()) {
-                if (a.isNull())
-                    hasNullAlleles = true;
-                if (a.isReference())
-                    refAllele = a;
-             }
-
-             if (!hasNullAlleles)
-               return inputVC;
-           // now we can recreate new genotypes with trimmed alleles
-            for ( Map.Entry<String, Genotype> sample : inputVC.getGenotypes().entrySet() ) {
-
-                List<Allele> originalAlleles = sample.getValue().getAlleles();
-                List<Allele> trimmedAlleles = new ArrayList<Allele>();
-                for ( Allele a : originalAlleles ) {
-                    if ( a.isCalled() )
-                        trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
-                    else
-                        trimmedAlleles.add(Allele.NO_CALL);
-                }
-                genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles));
-
-            }
-            return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0]));
-
-        }
-
-        return inputVC;
-    }
-
     public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) {
         try {
             return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) ||
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
index 1fde8879b..673fe4529 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
@@ -258,9 +258,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati
      * @param negLog10PError  qual
      * @param filters         filters: use null for unfiltered and empty set for passes filters
      * @param attributes      attributes
+     * @param referenceBaseForIndel   padded reference base
      */
-    public VariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
-        this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, null, true);
+    public VariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, Byte referenceBaseForIndel) {
+        this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true);
     }
 
     /**
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
index 834ad0917..986d6305c 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
@@ -657,12 +657,84 @@ public class VariantContextUtils {
 
         VariantContext merged = new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, negLog10PError, filters, (mergeInfoWithMaxAC ? attributesWithMaxAC : attributes) );
         // Trim the padded bases of all alleles if necessary
-        merged = AbstractVCFCodec.createVariantContextWithTrimmedAlleles(merged);
+        merged = createVariantContextWithTrimmedAlleles(merged);
 
         if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged);
         return merged;
     }
 
+    public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
+        // see if we need to trim common reference base from all alleles
+        boolean trimVC;
+
+        // We need to trim common reference base from all alleles in all genotypes if a ref base is common to all alleles
+        Allele refAllele = inputVC.getReference();
+        if (!inputVC.isVariant())
+            trimVC = false;
+        else if (refAllele.isNull())
+            trimVC = false;
+        else {
+            trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList<Allele>(inputVC.getAlternateAlleles()),
+                    inputVC.getReference().getDisplayString()) > 0);
+         }
+
+        // nothing to do if we don't need to trim bases
+        if (trimVC) {
+            List<Allele> alleles = new ArrayList<Allele>();
+            Map<String, Genotype> genotypes = new TreeMap<String, Genotype>();
+
+            // set the reference base for indels in the attributes
+            Map<String,Object> attributes = new TreeMap<String,Object>(inputVC.getAttributes());
+
+            Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
+
+            for (Allele a : inputVC.getAlleles()) {
+                if (a.isSymbolic()) {
+                    alleles.add(a);
+                    originalToTrimmedAlleleMap.put(a, a);
+                } else {
+                    // get bases for current allele and create a new one with trimmed bases
+                    byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length());
+                    Allele trimmedAllele = Allele.create(newBases, a.isReference());
+                    alleles.add(trimmedAllele);
+                    originalToTrimmedAlleleMap.put(a, trimmedAllele);
+                }
+            }
+
+            // detect case where we're trimming bases but resulting vc doesn't have any null allele. In that case, we keep original representation
+            // example: mixed records such as {TA*,TGA,TG}
+            boolean hasNullAlleles = false;
+
+            for (Allele a: originalToTrimmedAlleleMap.values()) {
+                if (a.isNull())
+                    hasNullAlleles = true;
+                if (a.isReference())
+                    refAllele = a;
+             }
+
+             if (!hasNullAlleles)
+               return inputVC;
+           // now we can recreate new genotypes with trimmed alleles
+            for ( Map.Entry<String, Genotype> sample : inputVC.getGenotypes().entrySet() ) {
+
+                List<Allele> originalAlleles = sample.getValue().getAlleles();
+                List<Allele> trimmedAlleles = new ArrayList<Allele>();
+                for ( Allele a : originalAlleles ) {
+                    if ( a.isCalled() )
+                        trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
+                    else
+                        trimmedAlleles.add(Allele.NO_CALL);
+                }
+                genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles));
+
+            }
+            return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0]));
+
+        }
+
+        return inputVC;
+    }
+
     public static Map<String, Genotype> stripPLs(Map<String, Genotype> genotypes) {
         Map<String, Genotype> newGs = new HashMap<String, Genotype>(genotypes.size());
 

From 09a729da3aba44d319c603a93058e902fb7a1aa0 Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Thu, 25 Aug 2011 15:42:52 -0400
Subject: [PATCH 23/25] Removing incorrect comment

---
 .../broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java  | 1 -
 1 file changed, 1 deletion(-)

diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
index 3a4ca2478..bb212e128 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
@@ -290,7 +290,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
         if ( !header.samplesWereAlreadySorted() )
             vc.getGenotypes();
 
-        // Trim bases of all alleles if necessary
         return vc;
     }
 

From 9b7512fd94f532d18b6793d2c81b03c2a306656c Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Thu, 25 Aug 2011 22:42:14 -0400
Subject: [PATCH 25/25] Just because there's a ref base doesn't mean the VC
 needs to be padded

---
 .../sting/gatk/walkers/variantutils/VariantsToTable.java      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
index 7d0b4c3d4..0cf425e6f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
@@ -271,7 +271,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
         getters.put("REF", new Getter() {
             public String get(VariantContext vc) {
                 String x = "";
-                if ( vc.hasReferenceBaseForIndel() ) {
+                if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) {
                     Byte refByte = vc.getReferenceBaseForIndel();
                     x=x+new String(new byte[]{refByte});
                 }
@@ -283,7 +283,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
                 StringBuilder x = new StringBuilder();
                 int n = vc.getAlternateAlleles().size();
                 if ( n == 0 ) return ".";
-                if ( vc.hasReferenceBaseForIndel() ) {
+                if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) {
                     Byte refByte = vc.getReferenceBaseForIndel();
                     x.append(new String(new byte[]{refByte}));
                 }