diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 428c896e9..bb3cd82a1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import org.apache.poi.hpsf.Variant; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.utils.MathUtils; @@ -165,6 +166,23 @@ import java.util.*; * -o output.vcf \ * -fraction 0.5 * + * Select only indels from a VCF: + * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T SelectVariants \ + * --variant input.vcf \ + * -o output.vcf \ + * -selectType INDEL + * + * Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column): + * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T SelectVariants \ + * --variant input.vcf \ + * -o output.vcf \ + * -selectType SNP -selectType MNP \ + * -restrictAllelesTo MULTIALLELIC + * * * */ @@ -223,11 +241,17 @@ public class SelectVariants extends RodWalker { @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false) private boolean EXCLUDE_FILTERED = false; - @Argument(fullName="excludeBiallelic", shortName="xl_biallelic", doc="Select only multi-allelic variants, excluding any biallelic one.", required=false) - private boolean EXCLUDE_BIALLELIC = false; - @Argument(fullName="selectMultiallelic", shortName="xl_multiallelic", doc="Select only biallelic variants, excluding any multi-allelic one.", required=false) - private boolean EXCLUDE_MULTIALLELIC = false; + /** + * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf. + * For example, a multiallelic record such as: + * 1 100 . A AAA,AAAAA + * will be excluded if "-restrictAllelesTo BIALLELIC" is included, because there are two alternate alleles, whereas a record such as: + * 1 100 . A T + * will be included in that case, but would be excluded if "-restrictAllelesTo MULTIALLELIC + */ + @Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false) + private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL; @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; @@ -272,11 +296,14 @@ public class SelectVariants extends RodWalker { @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false) private double fractionRandom = 0; - @Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs from the input file", required=false) - private boolean SELECT_SNPS = false; + /** + * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. + * When specified one or more times, a particular type of variant is selected. + * + */ + @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false) + private List TYPES_TO_INCLUDE = new ArrayList(); - @Argument(fullName="selectIndels", shortName="indels", doc="Select only indels from the input file", required=false) - private boolean SELECT_INDELS = false; @Hidden @Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) @@ -296,6 +323,13 @@ public class SelectVariants extends RodWalker { } + public enum NumberAlleleRestriction { + ALL, + BIALLELIC, + MULTIALLELIC + } + + private ArrayList selectedTypes = new ArrayList(); private ArrayList selectNames = new ArrayList(); private List jexls = null; @@ -360,6 +394,20 @@ public class SelectVariants extends RodWalker { for ( String sample : samples ) logger.info("Including sample '" + sample + "'"); + + + // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include + if (TYPES_TO_INCLUDE.isEmpty()) { + + for (VariantContext.Type t : VariantContext.Type.values()) + selectedTypes.add(t); + + } + else { + for (VariantContext.Type t : TYPES_TO_INCLUDE) + selectedTypes.add(t); + + } // Initialize VCF header Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); headerLines.add(new VCFHeaderLine("source", "SelectVariants")); @@ -499,18 +547,14 @@ public class SelectVariants extends RodWalker { if (!isConcordant(vc, compVCs)) return 0; } - if (EXCLUDE_BIALLELIC && vc.isBiallelic()) - return 0; - if (EXCLUDE_MULTIALLELIC && !vc.isBiallelic()) - return 0; - - // TODO - add ability to also select MNPs - // TODO - move variant selection arguments to the engine so other walkers can also do this - if (SELECT_INDELS && !(vc.isIndel() || vc.isMixed())) + if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic()) continue; - if (SELECT_SNPS && !vc.isSNP()) + if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic()) + continue; + + if (!selectedTypes.contains(vc.getType())) continue; VariantContext sub = subsetRecord(vc, samples); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 19db58e0c..7d0b4c3d4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -124,8 +124,12 @@ public class VariantsToTable extends RodWalker { * multi-allelic INFO field values can be lists of values. */ @Advanced - @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) - public boolean keepMultiAllelic = false; + @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) + public boolean keepMultiAllelic = false; + + @Hidden + @Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false) + public boolean logACSum = false; /** * By default, this tool throws a UserException when it encounters a field without a value in some record. This @@ -150,7 +154,7 @@ public class VariantsToTable extends RodWalker { if ( ++nRecords < MAX_RECORDS || MAX_RECORDS == -1 ) { for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) { if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) { - List vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA); + List vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, keepMultiAllelic, logACSum); out.println(Utils.join("\t", vals)); } } @@ -176,9 +180,11 @@ public class VariantsToTable extends RodWalker { * @param fields a non-null list of fields to capture from VC * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise * provides a value of NA + * @param kma if true, multiallelic variants are to be kept + * @param logsum if true, AF and AC are computed based on sum of allele counts. Otherwise, based on allele with highest count. * @return */ - public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) { + private static List extractFields(VariantContext vc, List fields, boolean allowMissingData, boolean kma, boolean logsum) { List vals = new ArrayList(); for ( String field : fields ) { @@ -206,12 +212,31 @@ public class VariantsToTable extends RodWalker { } if (field.equals("AF") || field.equals("AC")) { - if (val.contains(",")) { - // strip [,] and spaces - val = val.replace("[",""); - val = val.replace("]",""); - val = val.replace(" ",""); - } + String afo = val; + + double af=0; + if (afo.contains(",")) { + String[] afs = afo.split(","); + afs[0] = afs[0].substring(1,afs[0].length()); + afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); + + double[] afd = new double[afs.length]; + + for (int k=0; k < afd.length; k++) + afd[k] = Double.valueOf(afs[k]); + + if (kma && logsum) + af = MathUtils.sum(afd); + else + af = MathUtils.arrayMax(afd); + //af = Double.valueOf(afs[0]); + + } + else + if (!afo.equals("NA")) + af = Double.valueOf(afo); + + val = Double.toString(af); } vals.add(val); @@ -220,6 +245,9 @@ public class VariantsToTable extends RodWalker { return vals; } + public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) { + return extractFields(vc, fields, allowMissingData, false, false); + } // // default reduce -- doesn't do anything at all // diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index cec377f5f..20409d4ca 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -77,6 +77,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testConcordance--" + testFile, spec); } + @Test + public void testVariantTypeSelection() { + String testFile = validationDataLocation + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s -NO_HEADER", + 1, + Arrays.asList("e0b12c0b47a8a7a988b3587b47bfa8cf") + ); + + executeTest("testVariantTypeSelection--" + testFile, spec); + } + @Test(enabled=false) public void testRemovePLs() { String testFile = validationDataLocation + "combine.3.vcf"; diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 724518142..2417e5620 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -106,7 +106,7 @@ class DataProcessingPipeline extends QScript { // Because the realignment only happens after these scripts are executed, in case you are using // bwa realignment, this function will operate over the original bam files and output over the // (to be realigned) bam files. - def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, File] = { + def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, List[File]] = { // Creating a table with SAMPLE information from each input BAM file val sampleTable = scala.collection.mutable.Map.empty[String, List[File]] @@ -131,24 +131,25 @@ class DataProcessingPipeline extends QScript { sampleTable(sample) :+= rBam } } + return sampleTable.toMap - println("\n\n*** INPUT FILES ***\n") - // Creating one file for each sample in the dataset - val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] - for ((sample, flist) <- sampleTable) { - - println(sample + ":") - for (f <- flist) - println (f) - println() - - val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list") - sampleBamFiles(sample) = sampleFileName - add(writeList(flist, sampleFileName)) - } - println("*** INPUT FILES ***\n\n") - - return sampleBamFiles.toMap +// println("\n\n*** INPUT FILES ***\n") +// // Creating one file for each sample in the dataset +// val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] +// for ((sample, flist) <- sampleTable) { +// +// println(sample + ":") +// for (f <- flist) +// println (f) +// println() +// +// val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list") +// sampleBamFiles(sample) = sampleFileName +// //add(writeList(flist, sampleFileName)) +// } +// println("*** INPUT FILES ***\n\n") +// +// return sampleBamFiles.toMap } // Rebuilds the Read Group string to give BWA @@ -224,7 +225,10 @@ class DataProcessingPipeline extends QScript { def script = { + // final output list of processed bam files + var cohortList: List[File] = List() + // sets the model for the Indel Realigner cleanModelEnum = getIndelCleaningModel() // keep a record of the number of contigs in the first bam file in the list @@ -233,28 +237,19 @@ class DataProcessingPipeline extends QScript { val realignedBAMs = if (useBWApe || useBWAse) {performAlignment(bams)} else {revertBams(bams)} - // Generate a BAM file per sample joining all per lane files if necessary - val sampleBAMFiles: Map[String, File] = createSampleFiles(bams, realignedBAMs) + // generate a BAM file per sample joining all per lane files if necessary + val sampleBAMFiles: Map[String, List[File]] = createSampleFiles(bams, realignedBAMs) - // Final output list of processed bam files - var cohortList: List[File] = List() - - // Simple progress report - println("\nFound the following samples: ") - for ((sample, file) <- sampleBAMFiles) - println("\t" + sample + " -> " + file) - println("\n") - - // If this is a 'knowns only' indel realignment run, do it only once for all samples. + // if this is a 'knowns only' indel realignment run, do it only once for all samples. val globalIntervals = new File(outputDir + projectName + ".intervals") if (cleaningModel == ConsensusDeterminationModel.KNOWNS_ONLY) add(target(null, globalIntervals)) - // Put each sample through the pipeline - for ((sample, sampleFile) <- sampleBAMFiles) { - val bam = if (sampleFile.endsWith(".list")) {swapExt(sampleFile, ".list", ".bam")} else {sampleFile} + // put each sample through the pipeline + for ((sample, bamList) <- sampleBAMFiles) { // BAM files generated by the pipeline + val bam = new File(qscript.projectName + "." + sample + ".bam") val cleanedBam = swapExt(bam, ".bam", ".clean.bam") val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam") val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam") @@ -272,15 +267,16 @@ class DataProcessingPipeline extends QScript { // Validation is an optional step for the BAM file generated after // alignment and the final bam file of the pipeline. - if (!noValidation && sampleFile.endsWith(".bam")) { // todo -- implement validation for .list BAM files + if (!noValidation) { + for (sampleFile <- bamList) add(validate(sampleFile, preValidateLog), validate(recalBam, postValidateLog)) } if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY) - add(target(sampleFile, targetIntervals)) + add(target(bamList, targetIntervals)) - add(clean(sampleFile, targetIntervals, cleanedBam), + add(clean(bamList, targetIntervals, cleanedBam), dedup(cleanedBam, dedupedBam, metricsFile), cov(dedupedBam, preRecalFile), recal(dedupedBam, preRecalFile, recalBam), @@ -320,9 +316,9 @@ class DataProcessingPipeline extends QScript { this.maxRecordsInRam = 100000 } - case class target (inBams: File, outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { + case class target (inBams: List[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY) - this.input_file :+= inBams + this.input_file = inBams this.out = outIntervals this.mismatchFraction = 0.0 this.known :+= qscript.dbSNP @@ -333,8 +329,8 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outIntervals + ".target" } - case class clean (inBams: File, tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs { - this.input_file :+= inBams + case class clean (inBams: List[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs { + this.input_file = inBams this.targetIntervals = tIntervals this.out = outBam this.known :+= qscript.dbSNP diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index 9f3dd9a2c..accc5c7f0 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -145,7 +145,7 @@ class RecalibrateBaseQualities extends QScript { } case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.knownSites :+= dbSNP this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") this.input_file :+= inBam this.recal_file = outRecalFile