From 8b7a0b3b6272164a68538a664d1d0dc9cac46055 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 23 Aug 2011 12:40:01 -0400 Subject: [PATCH 04/25] Two new arguments to SelectVariants to exclude either multiallelic or biallelic sites from input vcf --- .../gatk/walkers/variantutils/SelectVariants.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index bfe7198cf..ae798cf58 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -223,6 +223,12 @@ public class SelectVariants extends RodWalker { @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false) private boolean EXCLUDE_FILTERED = false; + @Argument(fullName="excludeBiallelic", shortName="xl_biallelic", doc="Select only multi-allelic variants, excluding any biallelic one.", required=false) + private boolean EXCLUDE_BIALLELIC = false; + + @Argument(fullName="selectMultiallelic", shortName="xl_multiallelic", doc="Select only biallelic variants, excluding any multi-allelic one.", required=false) + private boolean EXCLUDE_MULTIALLELIC = false; + @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; @@ -335,6 +341,7 @@ public class SelectVariants extends RodWalker { // first, add any requested samples samples.addAll(samplesFromFile); samples.addAll(samplesFromExpressions); + if (sampleNames != null) samples.addAll(sampleNames); // if none were requested, we want all of them @@ -493,6 +500,11 @@ public class SelectVariants extends RodWalker { if (!isConcordant(vc, compVCs)) return 0; } + if (EXCLUDE_BIALLELIC && vc.isBiallelic()) + return 0; + + if (EXCLUDE_MULTIALLELIC && !vc.isBiallelic()) + return 0; // TODO - add ability to also select MNPs // TODO - move variant selection arguments to the engine so other walkers can also do this From 6e2552a9efe411e650ea629f6a59c1142fd8b990 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 23 Aug 2011 12:40:43 -0400 Subject: [PATCH 05/25] Merge fix --- .../sting/gatk/walkers/variantutils/SelectVariants.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index ae798cf58..428c896e9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -341,7 +341,6 @@ public class SelectVariants extends RodWalker { // first, add any requested samples samples.addAll(samplesFromFile); samples.addAll(samplesFromExpressions); - if (sampleNames != null) samples.addAll(sampleNames); // if none were requested, we want all of them From a1a1fac9e4f4fc8b8b4b07cf8603e26ac70b8c30 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 23 Aug 2011 13:43:07 -0400 Subject: [PATCH 06/25] Likelihood engine now gives non-zero likelihoods. Using HMM function that can handle context specific gap open and gap continuation penalties --- .../gatk/walkers/genotyper/UnifiedArgumentCollection.java | 2 +- .../sting/gatk/walkers/indels/PairHMMIndelErrorModel.java | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index e7f89bf08..ae419a5c4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -156,7 +156,7 @@ public class UnifiedArgumentCollection { public boolean OUTPUT_DEBUG_INDEL_INFO = false; @Hidden - @Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false) + @Argument(fullName = "dovit", shortName = "dovit", doc = "Perform full Viterbi calculation when evaluating the HMM", required = false) public boolean dovit = false; @Hidden diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 55450486b..2d7969230 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -274,7 +274,7 @@ public class PairHMMIndelErrorModel { this.doViterbi = dovit; } - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) { + public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) { this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob @@ -754,7 +754,7 @@ public class PairHMMIndelErrorModel { // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) if (indelLikelihoodMap.containsKey(p)) { - HashMap el = indelLikelihoodMap.get(p); + HashMap el = indelLikelihoodMap.get(p); int j=0; for (Allele a: haplotypeMap.keySet()) { readLikelihoods[readIdx][j++] = el.get(a); @@ -1055,7 +1055,6 @@ public class PairHMMIndelErrorModel { genotypeLikelihoods[i] -= maxElement; return genotypeLikelihoods; - } /** From 1ecbf05aae96096f2c075bf4c11095a620a2e9ec Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Tue, 23 Aug 2011 23:49:36 -0400 Subject: [PATCH 08/25] Avoid segfaults due to out of date and possibly abandonded LSF DRMAA implementation when use'ing LSF instead of .combined_LSF_SGE --- .../drmaa/v1_0/LibDrmaaIntegrationTest.java | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java index ac2064640..d98281ad3 100644 --- a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java @@ -40,6 +40,7 @@ import java.util.Arrays; import java.util.List; public class LibDrmaaIntegrationTest extends BaseTest { + private String implementation = null; @Test public void testDrmaa() throws Exception { @@ -79,10 +80,24 @@ public class LibDrmaaIntegrationTest extends BaseTest { Assert.fail(String.format("Could not get DRMAA implementation from the DRMAA library: %s", error.getString(0))); System.out.println(String.format("DRMAA implementation(s): %s", drmaaImplementation.getString(0))); + + this.implementation = drmaaImplementation.getString(0); } - @Test + @Test(dependsOnMethods = { "testDrmaa" }) public void testSubmitEcho() throws Exception { + if (implementation.indexOf("LSF") >= 0) { + System.err.println(" *********************************************************"); + System.err.println(" ***********************************************************"); + System.err.println(" **** ****"); + System.err.println(" **** Skipping LibDrmaaIntegrationTest.testSubmitEcho() ****"); + System.err.println(" **** Are you using the dotkit .combined_LSF_SGE? ****"); + System.err.println(" **** ****"); + System.err.println(" ***********************************************************"); + System.err.println(" *********************************************************"); + return; + } + Memory error = new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER); int errnum; @@ -129,7 +144,7 @@ public class LibDrmaaIntegrationTest extends BaseTest { errnum = LibDrmaa.drmaa_set_vector_attribute(jt, LibDrmaa.DRMAA_V_ARGV, args, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) - Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_REMOTE_COMMAND, error.getString(0))); + Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_V_ARGV, error.getString(0))); errnum = LibDrmaa.drmaa_run_job(jobIdMem, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN, jt, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); From 219252a5661d7f00bff5237c040d55d813f9c24c Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 23 Aug 2011 15:38:19 -0400 Subject: [PATCH 09/25] Adapting to the new RodBinding framework --- .../sting/queue/qscripts/PacbioProcessingPipeline.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index 9f3dd9a2c..accc5c7f0 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -145,7 +145,7 @@ class RecalibrateBaseQualities extends QScript { } case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.knownSites :+= dbSNP this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") this.input_file :+= inBam this.recal_file = outRecalFile From cd12f7f2865bd3237e3fe952f664859956aa61d1 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 24 Aug 2011 11:10:40 -0400 Subject: [PATCH 10/25] Fixed list dependency Instead of creating a bam list file, I dynamically create a scala list and pass as parameters. This way the intermediate bam files don't get deleted before they should. --- .../qscripts/DataProcessingPipeline.scala | 76 +++++++++---------- 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 724518142..2417e5620 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -106,7 +106,7 @@ class DataProcessingPipeline extends QScript { // Because the realignment only happens after these scripts are executed, in case you are using // bwa realignment, this function will operate over the original bam files and output over the // (to be realigned) bam files. - def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, File] = { + def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, List[File]] = { // Creating a table with SAMPLE information from each input BAM file val sampleTable = scala.collection.mutable.Map.empty[String, List[File]] @@ -131,24 +131,25 @@ class DataProcessingPipeline extends QScript { sampleTable(sample) :+= rBam } } + return sampleTable.toMap - println("\n\n*** INPUT FILES ***\n") - // Creating one file for each sample in the dataset - val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] - for ((sample, flist) <- sampleTable) { - - println(sample + ":") - for (f <- flist) - println (f) - println() - - val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list") - sampleBamFiles(sample) = sampleFileName - add(writeList(flist, sampleFileName)) - } - println("*** INPUT FILES ***\n\n") - - return sampleBamFiles.toMap +// println("\n\n*** INPUT FILES ***\n") +// // Creating one file for each sample in the dataset +// val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] +// for ((sample, flist) <- sampleTable) { +// +// println(sample + ":") +// for (f <- flist) +// println (f) +// println() +// +// val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list") +// sampleBamFiles(sample) = sampleFileName +// //add(writeList(flist, sampleFileName)) +// } +// println("*** INPUT FILES ***\n\n") +// +// return sampleBamFiles.toMap } // Rebuilds the Read Group string to give BWA @@ -224,7 +225,10 @@ class DataProcessingPipeline extends QScript { def script = { + // final output list of processed bam files + var cohortList: List[File] = List() + // sets the model for the Indel Realigner cleanModelEnum = getIndelCleaningModel() // keep a record of the number of contigs in the first bam file in the list @@ -233,28 +237,19 @@ class DataProcessingPipeline extends QScript { val realignedBAMs = if (useBWApe || useBWAse) {performAlignment(bams)} else {revertBams(bams)} - // Generate a BAM file per sample joining all per lane files if necessary - val sampleBAMFiles: Map[String, File] = createSampleFiles(bams, realignedBAMs) + // generate a BAM file per sample joining all per lane files if necessary + val sampleBAMFiles: Map[String, List[File]] = createSampleFiles(bams, realignedBAMs) - // Final output list of processed bam files - var cohortList: List[File] = List() - - // Simple progress report - println("\nFound the following samples: ") - for ((sample, file) <- sampleBAMFiles) - println("\t" + sample + " -> " + file) - println("\n") - - // If this is a 'knowns only' indel realignment run, do it only once for all samples. + // if this is a 'knowns only' indel realignment run, do it only once for all samples. val globalIntervals = new File(outputDir + projectName + ".intervals") if (cleaningModel == ConsensusDeterminationModel.KNOWNS_ONLY) add(target(null, globalIntervals)) - // Put each sample through the pipeline - for ((sample, sampleFile) <- sampleBAMFiles) { - val bam = if (sampleFile.endsWith(".list")) {swapExt(sampleFile, ".list", ".bam")} else {sampleFile} + // put each sample through the pipeline + for ((sample, bamList) <- sampleBAMFiles) { // BAM files generated by the pipeline + val bam = new File(qscript.projectName + "." + sample + ".bam") val cleanedBam = swapExt(bam, ".bam", ".clean.bam") val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam") val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam") @@ -272,15 +267,16 @@ class DataProcessingPipeline extends QScript { // Validation is an optional step for the BAM file generated after // alignment and the final bam file of the pipeline. - if (!noValidation && sampleFile.endsWith(".bam")) { // todo -- implement validation for .list BAM files + if (!noValidation) { + for (sampleFile <- bamList) add(validate(sampleFile, preValidateLog), validate(recalBam, postValidateLog)) } if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY) - add(target(sampleFile, targetIntervals)) + add(target(bamList, targetIntervals)) - add(clean(sampleFile, targetIntervals, cleanedBam), + add(clean(bamList, targetIntervals, cleanedBam), dedup(cleanedBam, dedupedBam, metricsFile), cov(dedupedBam, preRecalFile), recal(dedupedBam, preRecalFile, recalBam), @@ -320,9 +316,9 @@ class DataProcessingPipeline extends QScript { this.maxRecordsInRam = 100000 } - case class target (inBams: File, outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { + case class target (inBams: List[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY) - this.input_file :+= inBams + this.input_file = inBams this.out = outIntervals this.mismatchFraction = 0.0 this.known :+= qscript.dbSNP @@ -333,8 +329,8 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outIntervals + ".target" } - case class clean (inBams: File, tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs { - this.input_file :+= inBams + case class clean (inBams: List[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs { + this.input_file = inBams this.targetIntervals = tIntervals this.out = outBam this.known :+= qscript.dbSNP From e618cb1e7911e5cd12c43c5aaafc03a954f605a9 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 24 Aug 2011 12:25:50 -0400 Subject: [PATCH 11/25] a) Renamed/expanded SelectVariants arguments that choose particular kinds of variants and particular allelic types, now instead of -Indels or -SNPs we can specify for example -selectType [MIXED|INDEL|SNP|MNP|SYMBOLIC]. To select biallelic, multiallelic variants, use -restrictAllelesTo [BIALLELIC|MULTIALLELIC]. Corresponding gatkdocs changes. b) More useful AC,AF logging in VariantsToTable with multiallelic sites: instead of logging comma-separated values, log max value by default. Hidden, experimental argument -logACSum to log sum of ACs instead. This is due to extreme slowness of R in parsing strings to tokens and computing max/sum itself (~100x slower than gatk). c) Added integrationtest for new SelectVariants commands --- .../walkers/variantutils/SelectVariants.java | 78 +++++++++++++++---- .../walkers/variantutils/VariantsToTable.java | 48 +++++++++--- .../SelectVariantsIntegrationTest.java | 13 ++++ 3 files changed, 112 insertions(+), 27 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 428c896e9..bb3cd82a1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import org.apache.poi.hpsf.Variant; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.utils.MathUtils; @@ -165,6 +166,23 @@ import java.util.*; * -o output.vcf \ * -fraction 0.5 * + * Select only indels from a VCF: + * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T SelectVariants \ + * --variant input.vcf \ + * -o output.vcf \ + * -selectType INDEL + * + * Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column): + * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T SelectVariants \ + * --variant input.vcf \ + * -o output.vcf \ + * -selectType SNP -selectType MNP \ + * -restrictAllelesTo MULTIALLELIC + * * * */ @@ -223,11 +241,17 @@ public class SelectVariants extends RodWalker { @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false) private boolean EXCLUDE_FILTERED = false; - @Argument(fullName="excludeBiallelic", shortName="xl_biallelic", doc="Select only multi-allelic variants, excluding any biallelic one.", required=false) - private boolean EXCLUDE_BIALLELIC = false; - @Argument(fullName="selectMultiallelic", shortName="xl_multiallelic", doc="Select only biallelic variants, excluding any multi-allelic one.", required=false) - private boolean EXCLUDE_MULTIALLELIC = false; + /** + * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf. + * For example, a multiallelic record such as: + * 1 100 . A AAA,AAAAA + * will be excluded if "-restrictAllelesTo BIALLELIC" is included, because there are two alternate alleles, whereas a record such as: + * 1 100 . A T + * will be included in that case, but would be excluded if "-restrictAllelesTo MULTIALLELIC + */ + @Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false) + private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL; @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; @@ -272,11 +296,14 @@ public class SelectVariants extends RodWalker { @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false) private double fractionRandom = 0; - @Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs from the input file", required=false) - private boolean SELECT_SNPS = false; + /** + * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. + * When specified one or more times, a particular type of variant is selected. + * + */ + @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false) + private List TYPES_TO_INCLUDE = new ArrayList(); - @Argument(fullName="selectIndels", shortName="indels", doc="Select only indels from the input file", required=false) - private boolean SELECT_INDELS = false; @Hidden @Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) @@ -296,6 +323,13 @@ public class SelectVariants extends RodWalker { } + public enum NumberAlleleRestriction { + ALL, + BIALLELIC, + MULTIALLELIC + } + + private ArrayList selectedTypes = new ArrayList(); private ArrayList selectNames = new ArrayList(); private List jexls = null; @@ -360,6 +394,20 @@ public class SelectVariants extends RodWalker { for ( String sample : samples ) logger.info("Including sample '" + sample + "'"); + + + // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include + if (TYPES_TO_INCLUDE.isEmpty()) { + + for (VariantContext.Type t : VariantContext.Type.values()) + selectedTypes.add(t); + + } + else { + for (VariantContext.Type t : TYPES_TO_INCLUDE) + selectedTypes.add(t); + + } // Initialize VCF header Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); headerLines.add(new VCFHeaderLine("source", "SelectVariants")); @@ -499,18 +547,14 @@ public class SelectVariants extends RodWalker { if (!isConcordant(vc, compVCs)) return 0; } - if (EXCLUDE_BIALLELIC && vc.isBiallelic()) - return 0; - if (EXCLUDE_MULTIALLELIC && !vc.isBiallelic()) - return 0; - - // TODO - add ability to also select MNPs - // TODO - move variant selection arguments to the engine so other walkers can also do this - if (SELECT_INDELS && !(vc.isIndel() || vc.isMixed())) + if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic()) continue; - if (SELECT_SNPS && !vc.isSNP()) + if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic()) + continue; + + if (!selectedTypes.contains(vc.getType())) continue; VariantContext sub = subsetRecord(vc, samples); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 19db58e0c..7d0b4c3d4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -124,8 +124,12 @@ public class VariantsToTable extends RodWalker { * multi-allelic INFO field values can be lists of values. */ @Advanced - @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) - public boolean keepMultiAllelic = false; + @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) + public boolean keepMultiAllelic = false; + + @Hidden + @Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false) + public boolean logACSum = false; /** * By default, this tool throws a UserException when it encounters a field without a value in some record. This @@ -150,7 +154,7 @@ public class VariantsToTable extends RodWalker { if ( ++nRecords < MAX_RECORDS || MAX_RECORDS == -1 ) { for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) { if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) { - List vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA); + List vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, keepMultiAllelic, logACSum); out.println(Utils.join("\t", vals)); } } @@ -176,9 +180,11 @@ public class VariantsToTable extends RodWalker { * @param fields a non-null list of fields to capture from VC * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise * provides a value of NA + * @param kma if true, multiallelic variants are to be kept + * @param logsum if true, AF and AC are computed based on sum of allele counts. Otherwise, based on allele with highest count. * @return */ - public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) { + private static List extractFields(VariantContext vc, List fields, boolean allowMissingData, boolean kma, boolean logsum) { List vals = new ArrayList(); for ( String field : fields ) { @@ -206,12 +212,31 @@ public class VariantsToTable extends RodWalker { } if (field.equals("AF") || field.equals("AC")) { - if (val.contains(",")) { - // strip [,] and spaces - val = val.replace("[",""); - val = val.replace("]",""); - val = val.replace(" ",""); - } + String afo = val; + + double af=0; + if (afo.contains(",")) { + String[] afs = afo.split(","); + afs[0] = afs[0].substring(1,afs[0].length()); + afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); + + double[] afd = new double[afs.length]; + + for (int k=0; k < afd.length; k++) + afd[k] = Double.valueOf(afs[k]); + + if (kma && logsum) + af = MathUtils.sum(afd); + else + af = MathUtils.arrayMax(afd); + //af = Double.valueOf(afs[0]); + + } + else + if (!afo.equals("NA")) + af = Double.valueOf(afo); + + val = Double.toString(af); } vals.add(val); @@ -220,6 +245,9 @@ public class VariantsToTable extends RodWalker { return vals; } + public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) { + return extractFields(vc, fields, allowMissingData, false, false); + } // // default reduce -- doesn't do anything at all // diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index cec377f5f..20409d4ca 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -77,6 +77,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testConcordance--" + testFile, spec); } + @Test + public void testVariantTypeSelection() { + String testFile = validationDataLocation + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s -NO_HEADER", + 1, + Arrays.asList("e0b12c0b47a8a7a988b3587b47bfa8cf") + ); + + executeTest("testVariantTypeSelection--" + testFile, spec); + } + @Test(enabled=false) public void testRemovePLs() { String testFile = validationDataLocation + "combine.3.vcf"; From e5008aba00a8c0d89a4622b2f66920b0f5484256 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 24 Aug 2011 15:18:44 -0400 Subject: [PATCH 13/25] Output the top two haplotypes as a variant call by running smith-waterman alignment against the reference and calling any difference as variation. This is the first verion that runs end-to-end by taking in reads as bam file and writing out variant calls in VCF. --- .../sting/utils/variantcontext/VariantContext.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 888dc1e98..1fde8879b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -209,6 +209,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati /** * the complete constructor. Makes a complete VariantContext from its arguments + * This is the only constructor that is able to create indels! DO NOT USE THE OTHER ONES. * * @param source source * @param contig the contig @@ -293,7 +294,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati } /** - * Create a new variant context without genotypes and no Perror, no filters, and no attributes + * Create a new variant context with genotypes but without Perror, filters, and attributes * * @param source source * @param contig the contig From dc8398e165e6dcf138916249d4ce365375008248 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 24 Aug 2011 15:58:34 -0400 Subject: [PATCH 15/25] fixing bai output for indel cleaning. --- .../sting/queue/qscripts/DataProcessingPipeline.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 47ba0220f..116d16f35 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -313,7 +313,6 @@ class DataProcessingPipeline extends QScript { } case class clean (inBams: File, tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs { - @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") this.input_file :+= inBams this.targetIntervals = tIntervals this.out = outBam From e3f5d7067a6da44660132459d97d351f3c9c2185 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 24 Aug 2011 14:07:54 -0400 Subject: [PATCH 16/25] Added ReorderSam queue binding --- .../queue/extensions/picard/ReorderSam.scala | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala new file mode 100644 index 000000000..14b42ed5d --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala @@ -0,0 +1,48 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.commandline._ + +import java.io.File +/* + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 6/22/11 + * Time: 10:35 AM + */ +class ReorderSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction { + analysisName = "ReorderSam" + javaMainClass = "net.sf.picard.sam.SortSam" + + @Input(doc="Input file (bam or sam) to extract reads from.", shortName = "input", fullName = "input_bam_files", required = true) + var input: List[File] = Nil + + @Output(doc="Output file (bam or sam) to write extracted reads to.", shortName = "output", fullName = "output_bam_file", required = true) + var output: File = _ + + @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) + var outputIndex: File = _ + + @Argument(doc="Reference sequence to reorder reads to match.", shortName = "input", fullName = "input_bam_files", required = true) + var reference: File = _ + + @Argument(doc="If true, then allows only a partial overlap of the BAM contigs with the new reference sequence contigs. By default, this tool requires a corresponding contig in the new reference for each read contig.", shortName = "aic", fullName = "allow_incomplete_concordance", required = false) + var ALLOW_INCOMPLETE_DICT_CONCORDANCE: Boolean = false + + @Argument(doc="If true, then permits mapping from a read contig to a new reference contig with the same name but a different length. Highly dangerous, only use if you know what you are doing.", shortName = "acld", fullName = "allow_contig_length_discordance", required = false) + var ALLOW_CONTIG_LENGTH_DISCORDANCE: Boolean = false + + override def freezeFieldValues() { + super.freezeFieldValues() + if (outputIndex == null && output != null && output.toString.endsWith(".bam")) + outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + + override def inputBams = input + override def outputBam = output + this.createIndex = Some(outputIndex != null) + this.sortOrder = null + override def commandLine = super.commandLine + + " REFERENCE=" + reference + + conditionalParameter (ALLOW_INCOMPLETE_DICT_CONCORDANCE, " ALLOW_INCOMPLETE_DICT_CONCORDANCE=true") + conditionalParameter (ALLOW_CONTIG_LENGTH_DISCORDANCE, " ALLOW_CONTIG_LENGTH_DISCORDANCE=true") +} \ No newline at end of file From 16caca0822dfb08271f2ed30a31a8ff4c73da60f Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 24 Aug 2011 17:02:44 -0400 Subject: [PATCH 17/25] BLASR BAMs and new BWA parameters *Added the functions to turn a BLASR generated BAM file into a usable BAM file. *Modified the bwa parameters according to test results from NA12878 pb2k dataset. --- .../qscripts/PacbioProcessingPipeline.scala | 51 ++++++++++++------- .../queue/extensions/picard/ReorderSam.scala | 20 ++++---- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index accc5c7f0..6947d4398 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -4,9 +4,9 @@ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.util.QScriptUtils import net.sf.samtools.SAMFileHeader.SortOrder -import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups} import org.broadinstitute.sting.utils.exceptions.UserException import org.broadinstitute.sting.commandline.Hidden +import org.broadinstitute.sting.queue.extensions.picard.{ReorderSam, SortSam, AddOrReplaceReadGroups} /** * Created by IntelliJ IDEA. @@ -16,7 +16,7 @@ import org.broadinstitute.sting.commandline.Hidden */ -class RecalibrateBaseQualities extends QScript { +class PacbioProcessingPipeline extends QScript { @Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true) var input: File = _ @@ -39,13 +39,16 @@ class RecalibrateBaseQualities extends QScript { @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ + @Input(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false) + var BLASR_BAM: Boolean = false + @Hidden @Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false) var dbq: Int = 20 - - - + @Hidden + @Input(shortName="bwastring", required=false) + var bwastring: String = "" val queueLogDir: String = ".qlog/" @@ -57,8 +60,6 @@ class RecalibrateBaseQualities extends QScript { var USE_BWA: Boolean = false - println("DEBUG: processing " + file + "\nDEBUG: name -- " + file.getName) - if (file.endsWith(".fasta") || file.endsWith(".fq")) { if (bwaPath == null) { throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA"); @@ -69,28 +70,34 @@ class RecalibrateBaseQualities extends QScript { // FASTA -> BAM steps val alignedSam: File = file.getName + ".aligned.sam" val sortedBam: File = swapExt(alignedSam, ".sam", ".bam") - val qualBam: File = swapExt(sortedBam, ".bam", ".q.bam") val rgBam: File = swapExt(file, ".bam", ".rg.bam") val bamBase = if (USE_BWA) {rgBam} else {file} // BAM Steps + val mqBAM: File = swapExt(bamBase, ".bam", ".mq.bam") val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.csv") val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.csv") val recalBam: File = swapExt(bamBase, ".bam", ".recal.bam") val path1: String = recalBam + ".before" val path2: String = recalBam + ".after" - if (USE_BWA) { add(align(file, alignedSam), sortSam(alignedSam, sortedBam), - addQuals(sortedBam, qualBam, dbq), - addReadGroup(qualBam, rgBam, sample)) + addReadGroup(sortedBam, rgBam, sample)) } - add(cov(bamBase, recalFile1), - recal(bamBase, recalFile1, recalBam), + else if (BLASR_BAM) { + val reorderedBAM = swapExt(bamBase, ".bam", ".reordered.bam") + add(reorder(bamBase, reorderedBAM), + changeMQ(reorderedBAM, mqBAM)) + } + + val bam = if (BLASR_BAM) {mqBAM} else {bamBase} + + add(cov(bam, recalFile1), + recal(bam, recalFile1, recalBam), cov(recalBam, recalFile2), analyzeCovariates(recalFile1, path1), analyzeCovariates(recalFile2, path2)) @@ -110,13 +117,13 @@ class RecalibrateBaseQualities extends QScript { case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs { - def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z10 -t8 " + reference + " " + inFastq + " > " + outSam + def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z20 -t16 " + reference + " " + inFastq + " > " + outSam + this.memoryLimit = 8 this.analysisName = queueLogDir + outSam + ".bwa_sam_se" this.jobName = queueLogDir + outSam + ".bwa_sam_se" } - case class sortSam (@Input inSam: File, @Output outBam: File) extends SortSam with ExternalCommonArgs { - @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") + case class sortSam (inSam: File, outBam: File) extends SortSam with ExternalCommonArgs { this.input = List(inSam) this.output = outBam this.sortOrder = SortOrder.coordinate @@ -124,10 +131,16 @@ class RecalibrateBaseQualities extends QScript { this.jobName = queueLogDir + outBam + ".sortSam" } - case class addQuals(inBam: File, outBam: File, qual: Int) extends PrintReads with CommandLineGATKArgs { + case class reorder (inSam: File, outSam: File) extends ReorderSam with ExternalCommonArgs { + this.input = List(inSam) + this.output = outSam + this.sortReference = reference + } + + case class changeMQ(inBam: File, outBam: File) extends PrintReads with CommandLineGATKArgs { this.input_file :+= inBam this.out = outBam - this.DBQ = qual + this.read_filter :+= "ReassignMappingQuality" } case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs { @@ -145,6 +158,7 @@ class RecalibrateBaseQualities extends QScript { } case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { + this.DBQ = dbq this.knownSites :+= dbSNP this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") this.input_file :+= inBam @@ -155,6 +169,7 @@ class RecalibrateBaseQualities extends QScript { } case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs { + this.DBQ = dbq this.input_file :+= inBam this.recal_file = inRecalFile this.out = outBam diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala index 14b42ed5d..72489dc87 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala @@ -11,7 +11,7 @@ import java.io.File */ class ReorderSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction { analysisName = "ReorderSam" - javaMainClass = "net.sf.picard.sam.SortSam" + javaMainClass = "net.sf.picard.sam.ReorderSam" @Input(doc="Input file (bam or sam) to extract reads from.", shortName = "input", fullName = "input_bam_files", required = true) var input: List[File] = Nil @@ -22,27 +22,27 @@ class ReorderSam extends org.broadinstitute.sting.queue.function.JavaCommandLine @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) var outputIndex: File = _ - @Argument(doc="Reference sequence to reorder reads to match.", shortName = "input", fullName = "input_bam_files", required = true) - var reference: File = _ + @Argument(doc="Reference sequence to reorder reads to match.", shortName = "ref", fullName = "sort_reference", required = true) + var sortReference: File = _ @Argument(doc="If true, then allows only a partial overlap of the BAM contigs with the new reference sequence contigs. By default, this tool requires a corresponding contig in the new reference for each read contig.", shortName = "aic", fullName = "allow_incomplete_concordance", required = false) - var ALLOW_INCOMPLETE_DICT_CONCORDANCE: Boolean = false + var ALLOW_INCOMPLETE_DICT_CONCORDANCE: Boolean = _ @Argument(doc="If true, then permits mapping from a read contig to a new reference contig with the same name but a different length. Highly dangerous, only use if you know what you are doing.", shortName = "acld", fullName = "allow_contig_length_discordance", required = false) - var ALLOW_CONTIG_LENGTH_DISCORDANCE: Boolean = false + var ALLOW_CONTIG_LENGTH_DISCORDANCE: Boolean = _ override def freezeFieldValues() { super.freezeFieldValues() - if (outputIndex == null && output != null && output.toString.endsWith(".bam")) + if (outputIndex == null && output != null) outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") } override def inputBams = input override def outputBam = output - this.createIndex = Some(outputIndex != null) + this.createIndex = Some(true) this.sortOrder = null override def commandLine = super.commandLine + - " REFERENCE=" + reference + - conditionalParameter (ALLOW_INCOMPLETE_DICT_CONCORDANCE, " ALLOW_INCOMPLETE_DICT_CONCORDANCE=true") - conditionalParameter (ALLOW_CONTIG_LENGTH_DISCORDANCE, " ALLOW_CONTIG_LENGTH_DISCORDANCE=true") + " REFERENCE=" + sortReference + + optional(" ALLOW_INCOMPLETE_DICT_CONCORDANCE=", ALLOW_INCOMPLETE_DICT_CONCORDANCE) + optional(" ALLOW_CONTIG_LENGTH_DISCORDANCE=", ALLOW_CONTIG_LENGTH_DISCORDANCE) } \ No newline at end of file From 8bbef79fc275be0f250b2eaf8fdf6d2892b985a1 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 25 Aug 2011 15:37:26 -0400 Subject: [PATCH 22/25] Create clipped alleles during allele parsing instead of creating a full VC, clipping alleles, and regenerating the VC from scratch. --- .../utils/codecs/vcf/AbstractVCFCodec.java | 143 +++++------------- .../utils/variantcontext/VariantContext.java | 5 +- .../variantcontext/VariantContextUtils.java | 74 ++++++++- 3 files changed, 111 insertions(+), 111 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index a3100030e..3a4ca2478 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -281,7 +281,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VariantContext vc = null; try { - vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes); + vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes, ref.getBytes()[0]); } catch (Exception e) { generateException(e.getMessage()); } @@ -291,7 +291,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, vc.getGenotypes(); // Trim bases of all alleles if necessary - return createVariantContextWithTrimmedAlleles(vc); + return vc; } /** @@ -516,25 +516,44 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, return true; } - private static int computeForwardClipping(List unclippedAlleles, String ref) { + public static int computeForwardClipping(List unclippedAlleles, String ref) { boolean clipping = true; - // Note that the computation of forward clipping here is meant only to see whether there is a common - // base to all alleles, and to correctly compute reverse clipping, - // but it is not used for actually changing alleles - this is done in function - // createVariantContextWithTrimmedAlleles() below. - for (Allele a : unclippedAlleles) { - if (a.isSymbolic()) { + for ( Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) continue; - } - if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) { + + if ( a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0]) ) { clipping = false; + break; } } - return (clipping) ? 1 : 0; + return (clipping) ? 1 : 0; } + protected static int computeReverseClipping(List unclippedAlleles, String ref, int forwardClipping, int lineNo) { + int clipping = 0; + boolean stillClipping = true; + + while ( stillClipping ) { + for ( Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + continue; + + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) + stillClipping = false; + else if ( ref.length() == clipping ) + generateException("bad alleles encountered", lineNo); + else if ( a.getBases()[a.length()-clipping-1] != ref.getBytes()[ref.length()-clipping-1] ) + stillClipping = false; + } + if ( stillClipping ) + clipping++; + } + + return clipping; + } /** * clip the alleles, based on the reference * @@ -542,122 +561,30 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, * @param ref the reference string * @param unclippedAlleles the list of unclipped alleles * @param clippedAlleles output list of clipped alleles + * @param lineNo the current line number in the file * @return the new reference end position of this event */ protected static int clipAlleles(int position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { - // Note that the computation of forward clipping here is meant only to see whether there is a common - // base to all alleles, and to correctly compute reverse clipping, - // but it is not used for actually changing alleles - this is done in function - // createVariantContextWithTrimmedAlleles() below. - int forwardClipping = computeForwardClipping(unclippedAlleles, ref); - - int reverseClipped = 0; - boolean clipping = true; - while (clipping) { - for (Allele a : unclippedAlleles) { - if (a.isSymbolic()) { - continue; - } - if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0) - clipping = false; - else if (ref.length() == reverseClipped) - generateException("bad alleles encountered", lineNo); - else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1]) - clipping = false; - } - if (clipping) reverseClipped++; - } + int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo); if ( clippedAlleles != null ) { for ( Allele a : unclippedAlleles ) { if ( a.isSymbolic() ) { clippedAlleles.add(a); } else { - clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(),0,a.getBases().length-reverseClipped),a.isReference())); + clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length-reverseClipping), a.isReference())); } } } // the new reference length - int refLength = ref.length() - reverseClipped; + int refLength = ref.length() - reverseClipping; return position+Math.max(refLength - 1,0); } - public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { - // see if we need to trim common reference base from all alleles - boolean trimVC; - - // We need to trim common reference base from all alleles in all genotypes if a ref base is common to all alleles - Allele refAllele = inputVC.getReference(); - if (!inputVC.isVariant()) - trimVC = false; - else if (refAllele.isNull()) - trimVC = false; - else { - trimVC = (computeForwardClipping(new ArrayList(inputVC.getAlternateAlleles()), - inputVC.getReference().getDisplayString()) > 0); - } - - // nothing to do if we don't need to trim bases - if (trimVC) { - List alleles = new ArrayList(); - Map genotypes = new TreeMap(); - - // set the reference base for indels in the attributes - Map attributes = new TreeMap(inputVC.getAttributes()); - - Map originalToTrimmedAlleleMap = new HashMap(); - - for (Allele a : inputVC.getAlleles()) { - if (a.isSymbolic()) { - alleles.add(a); - originalToTrimmedAlleleMap.put(a, a); - } else { - // get bases for current allele and create a new one with trimmed bases - byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length()); - Allele trimmedAllele = Allele.create(newBases, a.isReference()); - alleles.add(trimmedAllele); - originalToTrimmedAlleleMap.put(a, trimmedAllele); - } - } - - // detect case where we're trimming bases but resulting vc doesn't have any null allele. In that case, we keep original representation - // example: mixed records such as {TA*,TGA,TG} - boolean hasNullAlleles = false; - - for (Allele a: originalToTrimmedAlleleMap.values()) { - if (a.isNull()) - hasNullAlleles = true; - if (a.isReference()) - refAllele = a; - } - - if (!hasNullAlleles) - return inputVC; - // now we can recreate new genotypes with trimmed alleles - for ( Map.Entry sample : inputVC.getGenotypes().entrySet() ) { - - List originalAlleles = sample.getValue().getAlleles(); - List trimmedAlleles = new ArrayList(); - for ( Allele a : originalAlleles ) { - if ( a.isCalled() ) - trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); - else - trimmedAlleles.add(Allele.NO_CALL); - } - genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles)); - - } - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0])); - - } - - return inputVC; - } - public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) { try { return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) || diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 1fde8879b..673fe4529 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -258,9 +258,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param negLog10PError qual * @param filters filters: use null for unfiltered and empty set for passes filters * @param attributes attributes + * @param referenceBaseForIndel padded reference base */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, null, true); + public VariantContext(String source, String contig, long start, long stop, Collection alleles, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { + this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 834ad0917..986d6305c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -657,12 +657,84 @@ public class VariantContextUtils { VariantContext merged = new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, negLog10PError, filters, (mergeInfoWithMaxAC ? attributesWithMaxAC : attributes) ); // Trim the padded bases of all alleles if necessary - merged = AbstractVCFCodec.createVariantContextWithTrimmedAlleles(merged); + merged = createVariantContextWithTrimmedAlleles(merged); if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); return merged; } + public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { + // see if we need to trim common reference base from all alleles + boolean trimVC; + + // We need to trim common reference base from all alleles in all genotypes if a ref base is common to all alleles + Allele refAllele = inputVC.getReference(); + if (!inputVC.isVariant()) + trimVC = false; + else if (refAllele.isNull()) + trimVC = false; + else { + trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList(inputVC.getAlternateAlleles()), + inputVC.getReference().getDisplayString()) > 0); + } + + // nothing to do if we don't need to trim bases + if (trimVC) { + List alleles = new ArrayList(); + Map genotypes = new TreeMap(); + + // set the reference base for indels in the attributes + Map attributes = new TreeMap(inputVC.getAttributes()); + + Map originalToTrimmedAlleleMap = new HashMap(); + + for (Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length()); + Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // detect case where we're trimming bases but resulting vc doesn't have any null allele. In that case, we keep original representation + // example: mixed records such as {TA*,TGA,TG} + boolean hasNullAlleles = false; + + for (Allele a: originalToTrimmedAlleleMap.values()) { + if (a.isNull()) + hasNullAlleles = true; + if (a.isReference()) + refAllele = a; + } + + if (!hasNullAlleles) + return inputVC; + // now we can recreate new genotypes with trimmed alleles + for ( Map.Entry sample : inputVC.getGenotypes().entrySet() ) { + + List originalAlleles = sample.getValue().getAlleles(); + List trimmedAlleles = new ArrayList(); + for ( Allele a : originalAlleles ) { + if ( a.isCalled() ) + trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); + else + trimmedAlleles.add(Allele.NO_CALL); + } + genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles)); + + } + return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0])); + + } + + return inputVC; + } + public static Map stripPLs(Map genotypes) { Map newGs = new HashMap(genotypes.size()); From 09a729da3aba44d319c603a93058e902fb7a1aa0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 25 Aug 2011 15:42:52 -0400 Subject: [PATCH 23/25] Removing incorrect comment --- .../broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 3a4ca2478..bb212e128 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -290,7 +290,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, if ( !header.samplesWereAlreadySorted() ) vc.getGenotypes(); - // Trim bases of all alleles if necessary return vc; } From 9b7512fd94f532d18b6793d2c81b03c2a306656c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 25 Aug 2011 22:42:14 -0400 Subject: [PATCH 25/25] Just because there's a ref base doesn't mean the VC needs to be padded --- .../sting/gatk/walkers/variantutils/VariantsToTable.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 7d0b4c3d4..0cf425e6f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -271,7 +271,7 @@ public class VariantsToTable extends RodWalker { getters.put("REF", new Getter() { public String get(VariantContext vc) { String x = ""; - if ( vc.hasReferenceBaseForIndel() ) { + if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) { Byte refByte = vc.getReferenceBaseForIndel(); x=x+new String(new byte[]{refByte}); } @@ -283,7 +283,7 @@ public class VariantsToTable extends RodWalker { StringBuilder x = new StringBuilder(); int n = vc.getAlternateAlleles().size(); if ( n == 0 ) return "."; - if ( vc.hasReferenceBaseForIndel() ) { + if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) { Byte refByte = vc.getReferenceBaseForIndel(); x.append(new String(new byte[]{refByte})); }