Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

2011-08-24 15:18:58 -04:00 · 2011-08-24 15:18:58 -04:00 · 29c7b10f7b
parent e5008aba00 61f5968807
commit 29c7b10f7b
5 changed files with 149 additions and 68 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@ -24,6 +24,7 @@

 package org.broadinstitute.sting.gatk.walkers.variantutils;

+import org.apache.poi.hpsf.Variant;
 import org.broadinstitute.sting.commandline.*;
 import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
 import org.broadinstitute.sting.utils.MathUtils;
@ -165,6 +166,23 @@ import java.util.*;
 *   -o output.vcf \
 *   -fraction 0.5
 *
+ * Select only indels from a VCF:
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectVariants \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -selectType INDEL
+ *
+ * Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column):
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectVariants \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -selectType SNP -selectType MNP \
+ *   -restrictAllelesTo MULTIALLELIC
+ *
 * </pre>
 *
 */
@ -223,11 +241,17 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
    @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
    private boolean EXCLUDE_FILTERED = false;

-    @Argument(fullName="excludeBiallelic", shortName="xl_biallelic", doc="Select only multi-allelic variants, excluding any biallelic one.", required=false)
-    private boolean EXCLUDE_BIALLELIC = false;

-    @Argument(fullName="selectMultiallelic", shortName="xl_multiallelic", doc="Select only biallelic variants, excluding any multi-allelic one.", required=false)
-    private boolean EXCLUDE_MULTIALLELIC = false;
+    /**
+     * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
+     * For example, a multiallelic record such as:
+     * 1    100 .   A   AAA,AAAAA
+     * will be excluded if "-restrictAllelesTo BIALLELIC" is included, because there are two alternate alleles, whereas a record such as:
+     * 1    100 .   A  T
+     * will be included in that case, but would be excluded if "-restrictAllelesTo MULTIALLELIC
+     */
+    @Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false)
+    private  NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL;

    @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false)
    private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
@ -272,11 +296,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
    @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
    private double fractionRandom = 0;

-    @Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs from the input file", required=false)
-    private boolean SELECT_SNPS = false;
+    /**
+      * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
+     * When specified one or more times, a particular type of variant is selected.
+     *
+      */
+    @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false)
+    private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<VariantContext.Type>();

-    @Argument(fullName="selectIndels", shortName="indels", doc="Select only indels from the input file", required=false)
-    private boolean SELECT_INDELS = false;

    @Hidden
    @Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
@ -296,6 +323,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {

    }

+    public enum NumberAlleleRestriction {
+           ALL,
+           BIALLELIC,
+           MULTIALLELIC
+    }
+
+    private ArrayList<VariantContext.Type> selectedTypes = new ArrayList<VariantContext.Type>();
    private ArrayList<String> selectNames = new ArrayList<String>();
    private List<VariantContextUtils.JexlVCMatchExp> jexls = null;

@ -360,6 +394,20 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
        for ( String sample : samples )
            logger.info("Including sample '" + sample + "'");

+
+
+        // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
+        if (TYPES_TO_INCLUDE.isEmpty()) {
+
+            for (VariantContext.Type t : VariantContext.Type.values())
+                selectedTypes.add(t);
+
+        }
+        else {
+            for (VariantContext.Type t : TYPES_TO_INCLUDE)
+                selectedTypes.add(t);
+
+        }
        // Initialize VCF header
        Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
        headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
@ -499,18 +547,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
                if (!isConcordant(vc, compVCs))
                    return 0;
            }
-            if (EXCLUDE_BIALLELIC && vc.isBiallelic())
-                return 0;

-            if (EXCLUDE_MULTIALLELIC && !vc.isBiallelic())
-                return 0;
-
-            // TODO - add ability to also select MNPs
-            // TODO - move variant selection arguments to the engine so other walkers can also do this
-            if (SELECT_INDELS && !(vc.isIndel() || vc.isMixed()))
+            if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic())
                continue;

-            if (SELECT_SNPS && !vc.isSNP())
+            if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic())
+                continue;
+
+            if (!selectedTypes.contains(vc.getType()))
                continue;

            VariantContext sub = subsetRecord(vc, samples);
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
@ -124,8 +124,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
     * multi-allelic INFO field values can be lists of values.
     */
    @Advanced
-    @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
-    public boolean keepMultiAllelic = false;
+     @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
+     public boolean keepMultiAllelic = false;
+
+    @Hidden
+    @Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false)
+     public boolean logACSum = false;

    /**
     * By default, this tool throws a UserException when it encounters a field without a value in some record.  This
@ -150,7 +154,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
        if ( ++nRecords < MAX_RECORDS || MAX_RECORDS == -1 ) {
            for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) {
                if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) {
-                    List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA);
+                    List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, keepMultiAllelic, logACSum);
                    out.println(Utils.join("\t", vals));
                }
            }
@ -176,9 +180,11 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
     * @param fields a non-null list of fields to capture from VC
     * @param allowMissingData if false, then throws a UserException if any field isn't found in vc.  Otherwise
     *   provides a value of NA
+     *   @param kma if true, multiallelic variants are to be kept
+     *   @param logsum if true, AF and AC are computed based on sum of allele counts. Otherwise, based on allele with highest count.
     * @return
     */
-    public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
+    private static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData, boolean kma, boolean logsum) {
        List<String> vals = new ArrayList<String>();

        for ( String field : fields ) {
@ -206,12 +212,31 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
            }

            if (field.equals("AF") || field.equals("AC")) {
-                      if (val.contains(",")) {
-                         // strip [,] and spaces
-                         val = val.replace("[","");
-                         val = val.replace("]","");
-                         val = val.replace(" ","");
-                        }
+                     String afo = val;
+
+                     double af=0;
+                     if (afo.contains(",")) {
+                         String[] afs = afo.split(",");
+                         afs[0] = afs[0].substring(1,afs[0].length());
+                         afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
+
+                         double[] afd = new double[afs.length];
+
+                         for (int k=0; k < afd.length; k++)
+                             afd[k] = Double.valueOf(afs[k]);
+
+                         if (kma && logsum)
+                             af = MathUtils.sum(afd);
+                         else
+                         af = MathUtils.arrayMax(afd);
+                         //af = Double.valueOf(afs[0]);
+
+                     }
+                     else
+                         if (!afo.equals("NA"))
+                             af = Double.valueOf(afo);
+
+                val = Double.toString(af);

            }
            vals.add(val);
@ -220,6 +245,9 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
        return vals;
    }

+    public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
+        return extractFields(vc, fields, allowMissingData, false, false);
+    }
    //
    // default reduce -- doesn't do anything at all
    //
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java
@ -77,6 +77,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
        executeTest("testConcordance--" + testFile, spec);
    }

+    @Test
+    public void testVariantTypeSelection() {
+        String testFile = validationDataLocation + "complexExample1.vcf";
+
+        WalkerTestSpec spec = new WalkerTestSpec(
+                "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s -NO_HEADER",
+                1,
+                Arrays.asList("e0b12c0b47a8a7a988b3587b47bfa8cf")
+        );
+
+        executeTest("testVariantTypeSelection--" + testFile, spec);
+    }
+
    @Test(enabled=false)
    public void testRemovePLs() {
        String testFile = validationDataLocation + "combine.3.vcf";
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala
@ -106,7 +106,7 @@ class DataProcessingPipeline extends QScript {
  // Because the realignment only happens after these scripts are executed, in case you are using
  // bwa realignment, this function will operate over the original bam files and output over the
  // (to be realigned) bam files.
-  def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, File] = {
+  def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, List[File]] = {

    // Creating a table with SAMPLE information from each input BAM file
    val sampleTable = scala.collection.mutable.Map.empty[String, List[File]]
@ -131,24 +131,25 @@ class DataProcessingPipeline extends QScript {
          sampleTable(sample) :+= rBam
      }
    }
+    return sampleTable.toMap

-    println("\n\n*** INPUT FILES ***\n")
-    // Creating one file for each sample in the dataset
-    val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
-    for ((sample, flist) <- sampleTable) {
-
-      println(sample + ":")
-      for (f <- flist)
-        println (f)
-      println()
-
-      val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list")
-      sampleBamFiles(sample) = sampleFileName
-      add(writeList(flist, sampleFileName))
-    }
-    println("*** INPUT FILES ***\n\n")
-
-    return sampleBamFiles.toMap
+//    println("\n\n*** INPUT FILES ***\n")
+//    // Creating one file for each sample in the dataset
+//    val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
+//    for ((sample, flist) <- sampleTable) {
+//
+//      println(sample + ":")
+//      for (f <- flist)
+//        println (f)
+//      println()
+//
+//      val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list")
+//      sampleBamFiles(sample) = sampleFileName
+//      //add(writeList(flist, sampleFileName))
+//    }
+//    println("*** INPUT FILES ***\n\n")
+//
+//    return sampleBamFiles.toMap
  }

  // Rebuilds the Read Group string to give BWA
@ -224,7 +225,10 @@ class DataProcessingPipeline extends QScript {


  def script = {
+    // final output list of processed bam files
+    var cohortList: List[File] = List()

+    // sets the model for the Indel Realigner
    cleanModelEnum = getIndelCleaningModel()

    // keep a record of the number of contigs in the first bam file in the list
@ -233,28 +237,19 @@ class DataProcessingPipeline extends QScript {

    val realignedBAMs = if (useBWApe || useBWAse) {performAlignment(bams)} else {revertBams(bams)}

-    // Generate a BAM file per sample joining all per lane files if necessary
-    val sampleBAMFiles: Map[String, File] = createSampleFiles(bams, realignedBAMs)
+    // generate a BAM file per sample joining all per lane files if necessary
+    val sampleBAMFiles: Map[String, List[File]] = createSampleFiles(bams, realignedBAMs)

-    // Final output list of processed bam files
-    var cohortList: List[File] = List()
-
-    // Simple progress report
-    println("\nFound the following samples: ")
-    for ((sample, file) <- sampleBAMFiles)
-      println("\t" + sample + " -> " + file)
-    println("\n")
-
-    // If this is a 'knowns only' indel realignment run, do it only once for all samples.
+    // if this is a 'knowns only' indel realignment run, do it only once for all samples.
    val globalIntervals = new File(outputDir + projectName + ".intervals")
    if (cleaningModel == ConsensusDeterminationModel.KNOWNS_ONLY)
      add(target(null, globalIntervals))

-    // Put each sample through the pipeline
-    for ((sample, sampleFile) <- sampleBAMFiles) {
-      val bam = if (sampleFile.endsWith(".list")) {swapExt(sampleFile, ".list", ".bam")} else {sampleFile}
+    // put each sample through the pipeline
+    for ((sample, bamList) <- sampleBAMFiles) {

      // BAM files generated by the pipeline
+      val bam        = new File(qscript.projectName + "." + sample + ".bam")
      val cleanedBam = swapExt(bam, ".bam", ".clean.bam")
      val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam")
      val recalBam   = swapExt(bam, ".bam", ".clean.dedup.recal.bam")
@ -272,15 +267,16 @@ class DataProcessingPipeline extends QScript {

      // Validation is an optional step for the BAM file generated after
      // alignment and the final bam file of the pipeline.
-      if (!noValidation && sampleFile.endsWith(".bam")) {        // todo -- implement validation for .list BAM files
+      if (!noValidation) {
+        for (sampleFile <- bamList)
        add(validate(sampleFile, preValidateLog),
            validate(recalBam, postValidateLog))
      }

      if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY)
-        add(target(sampleFile, targetIntervals))
+        add(target(bamList, targetIntervals))

-      add(clean(sampleFile, targetIntervals, cleanedBam),
+      add(clean(bamList, targetIntervals, cleanedBam),
          dedup(cleanedBam, dedupedBam, metricsFile),
          cov(dedupedBam, preRecalFile),
          recal(dedupedBam, preRecalFile, recalBam),
@ -320,9 +316,9 @@ class DataProcessingPipeline extends QScript {
      this.maxRecordsInRam = 100000
  }

-  case class target (inBams: File, outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
+  case class target (inBams: List[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
    if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY)
-      this.input_file :+= inBams
+      this.input_file = inBams
    this.out = outIntervals
    this.mismatchFraction = 0.0
    this.known :+= qscript.dbSNP
@ -333,8 +329,8 @@ class DataProcessingPipeline extends QScript {
    this.jobName = queueLogDir + outIntervals + ".target"
  }

-  case class clean (inBams: File, tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
-    this.input_file :+= inBams
+  case class clean (inBams: List[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
+    this.input_file = inBams
    this.targetIntervals = tIntervals
    this.out = outBam
    this.known :+= qscript.dbSNP
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala
@ -145,7 +145,7 @@ class RecalibrateBaseQualities extends QScript {
  }

  case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs {
-    this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
+    this.knownSites :+= dbSNP
    this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
    this.input_file :+= inBam
    this.recal_file = outRecalFile