From 5d19fca6490db8e07a1634074d30094576019859 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Tue, 11 Sep 2012 23:01:00 -0400 Subject: [PATCH 02/17] A couple of bug-fixy changes. 1) SelectVariants could throw a ReviewedStingException (one of the nasty "Bug:") ones if the user requested a sample that wasn't present in the VCF. The walker now checks for this in the initialize() phase, and throws a more informative error if the situation is detected. If the user simply wants to subset the VCF to all the samples requested that are actually present in the VCF, the --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES flag changes this UserException to a Warning, and does the appropriate subsetting. Added integration tests for this. 2) GenotypeLikelihoods has an unsafe method getLog10GQ(GenotypeType), which is completely broken for multi-allelic sites. I marked that method as deprecated, and added methods that use the context of the allele ordering (either directly specified or as a VC) to retrieve the appropriate GQ, and added a unit test to cover this case. VariantsToBinaryPed needs to dynamically calculate the GQ field sometimes (because I have some VCFs with PLs but no GQ). --- .../walkers/variantutils/SelectVariants.java | 29 +++++++++++++++++-- .../variantutils/VariantsToBinaryPed.java | 15 +++++++++- .../variantcontext/GenotypeLikelihoods.java | 27 +++++++++++++++++ .../SelectVariantsIntegrationTest.java | 29 +++++++++++++++++++ .../GenotypeLikelihoodsUnitTest.java | 26 +++++++++++++++++ 5 files changed, 122 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 3d14308b6..7bad19775 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -40,6 +40,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; @@ -325,6 +326,9 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(doc="indel size select",required=false,fullName="maxIndelSize") private int maxIndelSize = Integer.MAX_VALUE; + @Argument(doc="Allow a samples other than those in the VCF to be specified on the command line. These samples will be ignored.",required=false,fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES") + private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false; + /* Private class used to store the intermediate variants in the integer random selection process */ private static class RandomVariantStructure { @@ -386,10 +390,29 @@ public class SelectVariants extends RodWalker implements TreeR Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); - // first, add any requested samples - samples.addAll(samplesFromFile); - samples.addAll(samplesFromExpressions); + // first, check overlap between requested and present samples + Set commandLineUniqueSamples = new HashSet(samplesFromFile.size()+samplesFromExpressions.size()+sampleNames.size()); + commandLineUniqueSamples.addAll(samplesFromFile); + commandLineUniqueSamples.addAll(samplesFromExpressions); + commandLineUniqueSamples.addAll(sampleNames); + commandLineUniqueSamples.removeAll(vcfSamples); + if ( commandLineUniqueSamples.size() > 0 && ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES ) { + logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored."); + samplesFromFile.removeAll(commandLineUniqueSamples); + samplesFromExpressions.retainAll(commandLineUniqueSamples); + } else if (commandLineUniqueSamples.size() > 0 ) { + throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s", + "Samples entered on command line (through -sf or -sn) that are not present in the VCF.", + "A list of these samples:", + Utils.join(",",commandLineUniqueSamples), + "To ignore these samples, run with --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES")); + } + + // second, add the requested samples samples.addAll(sampleNames); + samples.addAll(samplesFromExpressions); + samples.addAll(samplesFromFile); + samples.removeAll(commandLineUniqueSamples); // if none were requested, we want all of them if ( samples.isEmpty() ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 2e6a80462..6bc6153df 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -7,7 +7,9 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -15,6 +17,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -278,7 +281,7 @@ public class VariantsToBinaryPed extends RodWalker { private byte getFlippedEncoding(Genotype g, int offset) { byte b; - if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) { + if ( ! checkGQIsGood(g) ) { b = NO_CALL; } else if ( g.isHomRef() ) { b = HOM_VAR; @@ -293,6 +296,16 @@ public class VariantsToBinaryPed extends RodWalker { return (byte) (b << (2*offset)); } + private boolean checkGQIsGood(Genotype genotype) { + if ( genotype.hasGQ() ) { + return genotype.getGQ() >= minGenotypeQuality; + } else if ( genotype.hasLikelihoods() ) { + return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality; + } + + return false; + } + private static String getID(VariantContext v) { if ( v.hasID() ) { return v.getID(); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 7b4256b70..641eb5449 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; import java.util.EnumMap; +import java.util.List; public class GenotypeLikelihoods { private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5; @@ -167,10 +168,36 @@ public class GenotypeLikelihoods { //Return the neg log10 Genotype Quality (GQ) for the given genotype //Returns Double.NEGATIVE_INFINITY in case of missing genotype + + /** + * This is really dangerous and returns completely wrong results for genotypes from a multi-allelic context. + * Use getLog10GQ(Genotype,VariantContext) or getLog10GQ(Genotype,List) in place of it. + * + * If you **know** you're biallelic, use getGQLog10FromLikelihoods directly. + * @param genotype - actually a genotype type (no call, hom ref, het, hom var) + * @return an unsafe quantity that could be negative. In the bi-allelic case, the GQ resulting from best minus next best (if the type is the best). + */ + @Deprecated public double getLog10GQ(GenotypeType genotype){ return getGQLog10FromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector()); } + @Requires({"genotypeAlleles != null","genotypeAlleles.size()==2","contextAlleles != null","contextAlleles.size() >= 1"}) + private double getLog10GQ(List genotypeAlleles,List contextAlleles) { + int allele1Index = contextAlleles.indexOf(genotypeAlleles.get(0)); + int allele2Index = contextAlleles.indexOf(genotypeAlleles.get(1)); + int plIndex = calculatePLindex(allele1Index,allele2Index); + return getGQLog10FromLikelihoods(plIndex,getAsVector()); + } + + public double getLog10GQ(Genotype genotype, List vcAlleles ) { + return getLog10GQ(genotype.getAlleles(),vcAlleles); + } + + public double getLog10GQ(Genotype genotype, VariantContext context) { + return getLog10GQ(genotype,context.getAlleles()); + } + public static double getGQLog10FromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){ if(likelihoods == null) return Double.NEGATIVE_INFINITY; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 77e29f87b..ffd9c9b4a 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -70,6 +70,20 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testComplexSelection--" + testfile, spec); } + @Test + public void testComplexSelectionWithNonExistingSamples() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES -sn A -se '[CDH]' -sn Z -sn T -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), + 1, + Arrays.asList("4386fbb258dcef4437495a37f5a83c53") + ); + spec.disableShadowBCF(); + executeTest("testComplexSelectionWithNonExistingSamples--" + testfile, spec); + } + @Test public void testNonExistingFieldSelection() { String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; @@ -98,6 +112,21 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testSampleExclusion--" + testfile, spec); } + @Test + public void testSampleInclusionWithNonexistingSamples() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -sn A -sn Z -sn Q -sf " + samplesFile + " --variant " + testfile, + 1, + UserException.BadInput.class + ); + spec.disableShadowBCF(); + + executeTest("testSampleInclusionWithNonexistingSamples--" + testfile, spec); + } + @Test public void testConcordance() { diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 69f42e1f9..4ce32cee7 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -29,12 +29,15 @@ package org.broadinstitute.sting.utils.variantcontext; // the imports for unit testing. +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.Test; +import java.util.Arrays; import java.util.EnumMap; +import java.util.List; /** @@ -44,6 +47,7 @@ public class GenotypeLikelihoodsUnitTest { double [] v = new double[]{-10.5, -1.25, -5.11}; final static String vGLString = "-10.50,-1.25,-5.11"; final static String vPLString = "93,0,39"; + double[] triAllelic = new double[]{-4.2,-2.0,-3.0,-1.6,0.0,-4.0}; //AA,AB,AC,BB,BC,CC @Test public void testFromVector2() { @@ -139,6 +143,28 @@ public class GenotypeLikelihoodsUnitTest { } } + // this test is completely broken, the method is wrong. + public void testGetQualFromLikelihoodsMultiAllelicBroken() { + GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); + double actualGQ = gl.getLog10GQ(GenotypeType.HET); + double expectedGQ = 1.6; + Assert.assertEquals(actualGQ,expectedGQ); + } + + public void testGetQualFromLikelihoodsMultiAllelic() { + GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); + Allele ref = Allele.create(BaseUtils.A,true); + Allele alt1 = Allele.create(BaseUtils.C); + Allele alt2 = Allele.create(BaseUtils.T); + List allAlleles = Arrays.asList(ref,alt1,alt2); + List gtAlleles = Arrays.asList(alt1,alt2); + GenotypeBuilder gtBuilder = new GenotypeBuilder(); + gtBuilder.alleles(gtAlleles); + double actualGQ = gl.getLog10GQ(gtBuilder.make(),allAlleles); + double expectedGQ = 1.6; + Assert.assertEquals(actualGQ,expectedGQ); + } + private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { Assert.assertEquals(v1.length, v2.length); for ( int i = 0; i < v1.length; i++ ) { From 91f320453491c3981cb980b617d352121a5b8705 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 11 Sep 2012 16:52:54 -0400 Subject: [PATCH 07/17] VCF/BCF writers once again automatically write out no-call genotypes for samples in the VCFHeader but not in the VC itself -- Turns out this was consuming 30% of the UG runtime, and causing problems elsewhere. -- Removed addMissingSamples from VariantcontextUtils, and calls to it -- Updated VCF / BCF writers to automatically write out a diploid no call for missing samples -- Added unit tests for this behavior in VariantContextWritersUnitTest --- .../genotyper/UnifiedGenotyperEngine.java | 19 +------- .../walkers/variantutils/CombineVariants.java | 2 +- .../walkers/variantutils/VariantsToVCF.java | 1 - .../utils/variantcontext/GenotypeBuilder.java | 13 ++++++ .../variantcontext/VariantContextUtils.java | 28 +----------- .../variantcontext/writer/BCF2Writer.java | 9 +++- .../variantcontext/writer/VCFWriter.java | 12 +---- .../VariantContextTestProvider.java | 45 +++++++++++++++++++ .../writer/VariantContextWritersUnitTest.java | 10 +++++ 9 files changed, 79 insertions(+), 60 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index a57c877e0..469d63b8a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -38,7 +38,6 @@ import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -198,23 +197,7 @@ public class UnifiedGenotyperEngine { } } - return addMissingSamples(results, allSamples); - } - - private List addMissingSamples(final List calls, final Set allSamples) { - if ( calls.isEmpty() || allSamples == null ) return calls; - - final List withAllSamples = new ArrayList(calls.size()); - for ( final VariantCallContext call : calls ) { - if ( call == null ) - withAllSamples.add(null); - else { - final VariantContext withoutMissing = VariantContextUtils.addMissingSamples(call, allSamples); - withAllSamples.add(new VariantCallContext(withoutMissing, call.confidentlyCalled, call.shouldEmit)); - } - } - - return withAllSamples; + return results; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 555999bdb..b1d8dc91d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -313,7 +313,7 @@ public class CombineVariants extends RodWalker implements Tree VariantContextUtils.calculateChromosomeCounts(builder, false); if ( minimalVCF ) VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); - vcfWriter.add(VariantContextUtils.addMissingSamples(builder.make(), samples)); + vcfWriter.add(builder.make()); } return vcs.isEmpty() ? 0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 78c9c4a1c..5f80f77a4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -246,7 +246,6 @@ public class VariantsToVCF extends RodWalker { } vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); - vc = VariantContextUtils.addMissingSamples(vc, samples); vcfwriter.add(vc); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java index 0ee32fa2e..9337a78f9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java @@ -53,6 +53,8 @@ import java.util.*; */ @Invariant({"alleles != null"}) public final class GenotypeBuilder { + private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + private String sampleName = null; private List alleles = Collections.emptyList(); @@ -90,6 +92,17 @@ public final class GenotypeBuilder { return new GenotypeBuilder(sampleName, alleles).PL(gls).make(); } + /** + * Create a new Genotype object for a sample that's missing from the VC (i.e., in + * the output header). Defaults to a diploid no call genotype ./. + * + * @param sampleName the name of this sample + * @return an initialized Genotype with sampleName that's a diploid ./. no call genotype + */ + public static Genotype createMissing(final String sampleName) { + return new GenotypeBuilder(sampleName).alleles(DIPLOID_NO_CALL).make(); + } + /** * Create a empty builder. Both a sampleName and alleles must be provided * before trying to make a Genotype from this builder. diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index d7e4a7135..8abcf115a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -32,8 +32,8 @@ import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -47,7 +47,6 @@ public class VariantContextUtils { public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_PREFIX = "filterIn"; - private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); private static Set MISSING_KEYS_WARNED_ABOUT = new HashSet(); final public static JexlEngine engine = new JexlEngine(); @@ -60,31 +59,6 @@ public class VariantContextUtils { engine.setDebug(false); } - /** - * Ensures that VC contains all of the samples in allSamples by adding missing samples to - * the resulting VC with default diploid ./. genotypes - * - * @param vc the VariantContext - * @param allSamples all of the samples needed - * @return a new VariantContext with missing samples added - */ - public static VariantContext addMissingSamples(final VariantContext vc, final Set allSamples) { - // TODO -- what's the fastest way to do this calculation? - final Set missingSamples = new HashSet(allSamples); - missingSamples.removeAll(vc.getSampleNames()); - - if ( missingSamples.isEmpty() ) - return vc; - else { - //logger.warn("Adding " + missingSamples.size() + " missing samples to called context"); - final GenotypesContext gc = GenotypesContext.copy(vc.getGenotypes()); - for ( final String missing : missingSamples ) { - gc.add(new GenotypeBuilder(missing).alleles(DIPLOID_NO_CALL).make()); - } - return new VariantContextBuilder(vc).genotypes(gc).make(); - } - } - /** * Update the attributes of the attributes map given the VariantContext to reflect the * proper chromosome-based VCF tags diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index e4c64b26b..a338c7c0d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -32,7 +32,10 @@ import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils; import org.broadinstitute.sting.utils.codecs.bcf2.BCFVersion; -import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFContigHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.*; @@ -345,10 +348,12 @@ class BCF2Writer extends IndexingVariantContextWriter { final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field); if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT"); + assert writer != null; + writer.start(encoder, vc); for ( final String name : sampleNames ) { Genotype g = vc.getGenotype(name); - if ( g == null ) VCFWriter.missingSampleError(vc, header); + if ( g == null ) g = GenotypeBuilder.createMissing(name); writer.addGenotype(encoder, vc, g); } writer.done(encoder, vc); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index db74f2263..93b3b603f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.variantcontext.writer; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -343,9 +342,7 @@ class VCFWriter extends IndexingVariantContextWriter { mWriter.write(VCFConstants.FIELD_SEPARATOR); Genotype g = vc.getGenotype(sample); - if ( g == null ) { - missingSampleError(vc, mHeader); - } + if ( g == null ) g = GenotypeBuilder.createMissing(sample); final List attrs = new ArrayList(genotypeFormatKeys.size()); for ( String field : genotypeFormatKeys ) { @@ -426,13 +423,6 @@ class VCFWriter extends IndexingVariantContextWriter { } } - public static final void missingSampleError(final VariantContext vc, final VCFHeader header) { - final List badSampleNames = new ArrayList(); - for ( final String x : header.getGenotypeSamples() ) - if ( ! vc.hasGenotype(x) ) badSampleNames.add(x); - throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects. Missing samples are " + Utils.join(",", badSampleNames)); - } - private boolean isMissingValue(String s) { // we need to deal with the case that it's a list of missing values return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length()); diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 26e2dbfbc..6785fa816 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -596,6 +596,51 @@ public class VariantContextTestProvider { return TEST_DATAs; } + public static void testReaderWriterWithMissingGenotypes(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { + final int nSamples = data.header.getNGenotypeSamples(); + if ( nSamples > 2 ) { + for ( final VariantContext vc : data.vcs ) + if ( vc.isSymbolic() ) + // cannot handle symbolic alleles because they may be weird non-call VCFs + return; + + final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension()); + tmpFile.deleteOnExit(); + + // write expected to disk + final EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); + final VariantContextWriter writer = tester.makeWriter(tmpFile, options); + + final Set samplesInVCF = new HashSet(data.header.getGenotypeSamples()); + final List missingSamples = Arrays.asList("MISSING1", "MISSING2"); + final List allSamples = new ArrayList(missingSamples); + allSamples.addAll(samplesInVCF); + + final VCFHeader header = new VCFHeader(data.header.getMetaDataInInputOrder(), allSamples); + writeVCsToFile(writer, header, data.vcs); + + // ensure writing of expected == actual + final Pair> p = readAllVCs(tmpFile, tester.makeCodec()); + final Iterable actual = p.getSecond(); + + int i = 0; + for ( final VariantContext readVC : actual ) { + if ( readVC == null ) continue; // sometimes we read null records... + final VariantContext expected = data.vcs.get(i++); + for ( final Genotype g : readVC.getGenotypes() ) { + Assert.assertTrue(allSamples.contains(g.getSampleName())); + if ( samplesInVCF.contains(g.getSampleName()) ) { + assertEquals(g, expected.getGenotype(g.getSampleName())); + } else { + // missing + Assert.assertTrue(g.isNoCall()); + } + } + } + + } + } + public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { testReaderWriter(tester, data.header, data.vcs, data.vcs, true); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java index 1b791bf6c..adf3eb235 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java @@ -82,6 +82,11 @@ public class VariantContextWritersUnitTest extends BaseTest { VariantContextTestProvider.testReaderWriter(new BCFIOTester(), testData); } + @Test(dataProvider = "VariantContextTest_SingleContexts") + public void testBCF2WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException { + VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new BCFIOTester(), testData); + } + private class BCFIOTester extends VariantContextTestProvider.VariantContextIOTest { @Override public String getExtension() { @@ -110,6 +115,11 @@ public class VariantContextWritersUnitTest extends BaseTest { VariantContextTestProvider.testReaderWriter(new VCFIOTester(), testData); } + @Test(enabled = true, dataProvider = "VariantContextTest_SingleContexts") + public void testVCF4WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException { + VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new VCFIOTester(), testData); + } + private class VCFIOTester extends VariantContextTestProvider.VariantContextIOTest { @Override public String getExtension() { From d1ba17df5dfb2294f17873e918674627aec30a77 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 12 Sep 2012 06:41:36 -0400 Subject: [PATCH 08/17] Fixed nasty bug in BCF2 writer for case where all genotypes are missing -- Previous code was looking for a -1 result from maxPloidy() but the result as actually 0, so instead of writing a diploid no call we were actually writing "unavailable" genotypes, and failing the BCF == VCF test in integration tests. Fixed. --- .../sting/utils/variantcontext/GenotypesContext.java | 6 +++++- .../sting/utils/variantcontext/writer/BCF2FieldWriter.java | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java index ba8668fa9..02ea1a1f2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.utils.variantcontext; import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import java.util.*; @@ -413,6 +412,11 @@ public class GenotypesContext implements List { return getGenotypes().get(i); } + /** + * What is the max ploidy among all samples? Returns 0 if no genotypes are present + * + * @return + */ @Ensures("result >= 0") public int getMaxPloidy() { if ( maxPloidy == -1 ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java index 5b81e7117..497c68c0c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java @@ -275,7 +275,7 @@ public abstract class BCF2FieldWriter { nValuesPerGenotype = vc.getMaxPloidy(); // deal with the case where we have no call everywhere, in which case we write out diploid - if ( nValuesPerGenotype == -1 ) + if ( nValuesPerGenotype == 0 ) nValuesPerGenotype = 2; super.start(encoder, vc); From bfbf1686cd0f71c94dea59c84b6c74c71f0ae1af Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 12 Sep 2012 07:08:03 -0400 Subject: [PATCH 09/17] Fixed nasty bug with defaulting to diploid no-call genotypes -- For the pooled caller we were writing diploid no-calls even when other samples were haploid. Changed maxPloidy function to return a defaultPloidy, rather than 0, in the case where all samples are missing. -- VCF/BCF Writers now create missing genotypes with the ploidy of other samples, or 2 if none are available at all. -- Updating integration tests for general ploidy, as previously we wrote ./. even when other calls were 0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/1/1/1/1/1, but now we write ./././././././././././././././././././././././. (ugly but correct) --- .../UnifiedGenotyperGeneralPloidyIntegrationTest.java | 6 +++--- .../sting/utils/codecs/vcf/VCFCompoundHeaderLine.java | 4 ++-- .../sting/utils/variantcontext/GenotypeBuilder.java | 11 +++++++++-- .../sting/utils/variantcontext/GenotypesContext.java | 11 +++++++++-- .../sting/utils/variantcontext/VariantContext.java | 9 +++++---- .../utils/variantcontext/writer/BCF2FieldWriter.java | 6 +----- .../sting/utils/variantcontext/writer/BCF2Writer.java | 2 +- .../sting/utils/variantcontext/writer/VCFWriter.java | 4 +++- 8 files changed, 33 insertions(+), 20 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index e0bf07809..a4a618887 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -1,9 +1,9 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; import java.util.Arrays; -import org.testng.annotations.Test; /** * Created by IntelliJ IDEA. @@ -52,7 +52,7 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","90af837f372e3d5143af30bf5c8c2b75"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","567ae6b2a7f839b1307d4087c2f59cca"); } @Test(enabled = true) @@ -62,7 +62,7 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","26598044436c8044f22ffa767b06a0f0"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","d2a22e12f1969ae199557947e5039b58"); } @Test(enabled = true) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index 667de3dea..5273806a7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -88,8 +88,8 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF case UNBOUNDED: return -1; case A: return vc.getNAlleles() - 1; case G: - final int ploidy = vc.getMaxPloidy(); - return GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), ploidy == 0 ? 2 : ploidy); + final int ploidy = vc.getMaxPloidy(2); + return GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), ploidy); default: throw new ReviewedStingException("Unknown count type: " + countType); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java index 9337a78f9..8fd792d3b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java @@ -53,6 +53,7 @@ import java.util.*; */ @Invariant({"alleles != null"}) public final class GenotypeBuilder { + private static final List HAPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL); private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); private String sampleName = null; @@ -99,8 +100,14 @@ public final class GenotypeBuilder { * @param sampleName the name of this sample * @return an initialized Genotype with sampleName that's a diploid ./. no call genotype */ - public static Genotype createMissing(final String sampleName) { - return new GenotypeBuilder(sampleName).alleles(DIPLOID_NO_CALL).make(); + public static Genotype createMissing(final String sampleName, final int ploidy) { + final GenotypeBuilder builder = new GenotypeBuilder(sampleName); + switch ( ploidy ) { + case 1: builder.alleles(HAPLOID_NO_CALL); break; + case 2: builder.alleles(DIPLOID_NO_CALL); break; + default: builder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL)); break; + } + return builder.make(); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java index 02ea1a1f2..f306bac4d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -413,18 +413,25 @@ public class GenotypesContext implements List { } /** - * What is the max ploidy among all samples? Returns 0 if no genotypes are present + * What is the max ploidy among all samples? Returns defaultPloidy if no genotypes are present * + * @param defaultPloidy the default ploidy, if all samples are no-called * @return */ @Ensures("result >= 0") - public int getMaxPloidy() { + public int getMaxPloidy(final int defaultPloidy) { + if ( defaultPloidy < 0 ) throw new IllegalArgumentException("defaultPloidy must be greater than or equal to 0"); + if ( maxPloidy == -1 ) { maxPloidy = 0; // necessary in the case where there are no genotypes for ( final Genotype g : getGenotypes() ) { maxPloidy = Math.max(g.getPloidy(), maxPloidy); } + + // everything is no called so we return the default ploidy + if ( maxPloidy == 0 ) maxPloidy = defaultPloidy; } + return maxPloidy; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index dd16cf7e1..abac84202 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -642,14 +642,15 @@ public class VariantContext implements Feature { // to enable tribble integratio } /** - * Returns the maximum ploidy of all samples in this VC, or -1 if there are no genotypes + * Returns the maximum ploidy of all samples in this VC, or default if there are no genotypes * * This function is caching, so it's only expensive on the first call * - * @return -1, or the max ploidy + * @param defaultPloidy the default ploidy, if all samples are no-called + * @return default, or the max ploidy */ - public int getMaxPloidy() { - return genotypes.getMaxPloidy(); + public int getMaxPloidy(final int defaultPloidy) { + return genotypes.getMaxPloidy(defaultPloidy); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java index 497c68c0c..61c0129bb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java @@ -272,11 +272,7 @@ public abstract class BCF2FieldWriter { encodingType = BCF2Type.INT8; buildAlleleMap(vc); - nValuesPerGenotype = vc.getMaxPloidy(); - - // deal with the case where we have no call everywhere, in which case we write out diploid - if ( nValuesPerGenotype == 0 ) - nValuesPerGenotype = 2; + nValuesPerGenotype = vc.getMaxPloidy(2); super.start(encoder, vc); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index a338c7c0d..536f07f90 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -353,7 +353,7 @@ class BCF2Writer extends IndexingVariantContextWriter { writer.start(encoder, vc); for ( final String name : sampleNames ) { Genotype g = vc.getGenotype(name); - if ( g == null ) g = GenotypeBuilder.createMissing(name); + if ( g == null ) g = GenotypeBuilder.createMissing(name, writer.nValuesPerGenotype); writer.addGenotype(encoder, vc, g); } writer.done(encoder, vc); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index 93b3b603f..f5306b6da 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -338,11 +338,13 @@ class VCFWriter extends IndexingVariantContextWriter { */ private void addGenotypeData(VariantContext vc, Map alleleMap, List genotypeFormatKeys) throws IOException { + final int ploidy = vc.getMaxPloidy(2); + for ( String sample : mHeader.getGenotypeSamples() ) { mWriter.write(VCFConstants.FIELD_SEPARATOR); Genotype g = vc.getGenotype(sample); - if ( g == null ) g = GenotypeBuilder.createMissing(sample); + if ( g == null ) g = GenotypeBuilder.createMissing(sample, ploidy); final List attrs = new ArrayList(genotypeFormatKeys.size()); for ( String field : genotypeFormatKeys ) { From 96be1cbea9eefb1d6bd5797c7979aae84a26d04c Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Wed, 12 Sep 2012 10:11:06 -0400 Subject: [PATCH 10/17] My own integration test isn't passing with a clean checkout. This fix to the walker ought to do it. --- .../walkers/variantutils/SelectVariants.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 7bad19775..9664a5bde 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -396,10 +396,17 @@ public class SelectVariants extends RodWalker implements TreeR commandLineUniqueSamples.addAll(samplesFromExpressions); commandLineUniqueSamples.addAll(sampleNames); commandLineUniqueSamples.removeAll(vcfSamples); + + // second, add the requested samples + samples.addAll(sampleNames); + samples.addAll(samplesFromExpressions); + samples.addAll(samplesFromFile); + + logger.debug(Utils.join(",",commandLineUniqueSamples)); + if ( commandLineUniqueSamples.size() > 0 && ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES ) { logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored."); - samplesFromFile.removeAll(commandLineUniqueSamples); - samplesFromExpressions.retainAll(commandLineUniqueSamples); + samples.removeAll(commandLineUniqueSamples); } else if (commandLineUniqueSamples.size() > 0 ) { throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s", "Samples entered on command line (through -sf or -sn) that are not present in the VCF.", @@ -408,11 +415,6 @@ public class SelectVariants extends RodWalker implements TreeR "To ignore these samples, run with --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES")); } - // second, add the requested samples - samples.addAll(sampleNames); - samples.addAll(samplesFromExpressions); - samples.addAll(samplesFromFile); - samples.removeAll(commandLineUniqueSamples); // if none were requested, we want all of them if ( samples.isEmpty() ) { From 994a4ff387fa5054e747e1449c75e75d00689845 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 12 Sep 2012 11:24:53 -0400 Subject: [PATCH 11/17] Track all outputs from BQSR (.table, .csv., and .pdf) as @Output arguments. Updated integration tests because we no longer have command-line options not to generate plots (now just don't provide a pdf) or to keep the intermediate csv (now, just provide a filename on the command-line). This is currently busted because we can't access the original filenames from the Engine's storage/stub system and therefore cannot call out to the Rscript with the executor (which requires filename strings). --- .../walkers/bqsr/BQSRIntegrationTest.java | 33 +++++----- .../sting/gatk/walkers/bqsr/BQSRGatherer.java | 12 ++-- .../gatk/walkers/bqsr/BaseRecalibrator.java | 18 ++---- .../bqsr/RecalibrationArgumentCollection.java | 30 ++++++---- .../sting/utils/recalibration/RecalUtils.java | 60 ++++++++----------- .../recalibration/RecalibrationReport.java | 6 -- 6 files changed, 67 insertions(+), 92 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 85615962c..58ce7ffef 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -34,7 +34,6 @@ public class BQSRIntegrationTest extends WalkerTest { " -I " + bam + " -L " + interval + args + - " --no_plots" + " -knownSites " + (reference.equals(b36KGReference) ? b36dbSNP129 : hg18dbSNP132) + " -o %s"; } @@ -50,21 +49,21 @@ public class BQSRIntegrationTest extends WalkerTest { String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam"; String HiSeqInterval = "chr1:10,000,000-10,100,000"; return new Object[][]{ - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "1cfc73371abb933ca26496745d105ff0")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "ee5142776008741b1b2453b1258c6d99")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "fbc520794f0f98d52159de956f7217f1")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab5b93794049c514bf8e407019d76b67")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "81df636e3d0ed6f16113517e0169bc96")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "ad3c47355448f8c45e172c6e1129c65d")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "fef7240140a9b6d6335ce009fa4edec5")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "600652ee49b9ce1ca2d8ee2d8b7c8211")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "769f95b9dcc78a405d3e6b191e5a19f5")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "43fcba51264cc98bd8466d21e1b96766")}, - {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "48aaf9ac54b97eac3663882a59354ab2")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "dac04b9e1e1c52af8d3a50c2e550fda9")}, - {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "90d70542076715a8605a8d4002614b34")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "600652ee49b9ce1ca2d8ee2d8b7c8211")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "26a04f5a28c40750c603cbe8a926d7bd")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "5a28b9fb5f2e36703e9804d276c38009")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "646a7c6db12cf0ec119bc27abed9c7b8")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "777f21676435837ba470497e17624266")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f7d77e0d86d033c69f25ef9858fdb95d")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "c3866646833cbb60831695d016d614d1")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "04c1d020bdb25fc55c3983748702290c")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "edf77f41cdd6c27f987cb1ecbcaa889b")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "3d52db844e8220d2dbdcd1339b3d3000")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "47605edafb4da0859bf735a6bd2dfe9c")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "0ac92d3548fdca8f253121842bb38c65")}, + {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "de7448f5bf787c17f1ee4c415bc90d3c")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "60542fe8a3cc89a47421767c6e1c11cd")}, + {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "f9a5a8f1b8f77f4c8857ccba8bff49a6")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "3d52db844e8220d2dbdcd1339b3d3000")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "919d88b173b0c11cbca762132bc94ab9")}, }; } @@ -88,7 +87,6 @@ public class BQSRIntegrationTest extends WalkerTest { " -R " + b36KGReference + " -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" + " -L 1:10,000,000-10,200,000" + - " --no_plots" + " -o %s", 1, // just one output file UserException.CommandLineException.class); @@ -102,7 +100,6 @@ public class BQSRIntegrationTest extends WalkerTest { " -R " + b36KGReference + " -I " + privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam" + " -L 1:50,000-80,000" + - " --no_plots" + " -o %s", 1, // just one output file UserException.class); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java index a6d82d5b3..128b3f809 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -49,7 +49,6 @@ public class BQSRGatherer extends Gatherer { @Override public void gather(List inputs, File output) { - RecalibrationReport generalReport = null; final PrintStream outputFile; try { outputFile = new PrintStream(output); @@ -57,6 +56,7 @@ public class BQSRGatherer extends Gatherer { throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE); } + RecalibrationReport generalReport = null; for (File input : inputs) { final RecalibrationReport inputReport = new RecalibrationReport(input); if (generalReport == null) @@ -70,14 +70,12 @@ public class BQSRGatherer extends Gatherer { generalReport.calculateQuantizedQualities(); RecalibrationArgumentCollection RAC = generalReport.getRAC(); - if (RAC.recalibrationReport != null && !RAC.NO_PLOTS) { - final File recal_out = new File(output.getName() + ".original"); + if (RAC.recalibrationReport != null && RAC.RECAL_PDF != null) { final RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport); - RecalUtils.generateRecalibrationPlot(recal_out, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates(), RAC.KEEP_INTERMEDIATE_FILES); + RecalUtils.generateRecalibrationPlot(RAC, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates()); } - else if (!RAC.NO_PLOTS) { - final File recal_out = new File(output.getName() + ".recal"); - RecalUtils.generateRecalibrationPlot(recal_out, generalReport.getRecalibrationTables(), generalReport.getCovariates(), RAC.KEEP_INTERMEDIATE_FILES); + else if (RAC.RECAL_PDF != null) { + RecalUtils.generateRecalibrationPlot(RAC, generalReport.getRecalibrationTables(), generalReport.getCovariates()); } generalReport.output(outputFile); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 43aa85a05..04ebeed55 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -50,8 +50,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; import java.lang.reflect.Constructor; import java.util.ArrayList; @@ -110,6 +108,7 @@ import java.util.ArrayList; @Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality @PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta public class BaseRecalibrator extends LocusWalker implements TreeReducible, NanoSchedulable { + @ArgumentCollection private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates @@ -284,7 +283,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed generateReport(); logger.info("...done!"); - if (!RAC.NO_PLOTS) { + if (RAC.RECAL_PDF != null) { logger.info("Generating recalibration plots..."); generatePlots(); } @@ -296,10 +295,10 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE; if (recalFile != null) { RecalibrationReport report = new RecalibrationReport(recalFile); - RecalUtils.generateRecalibrationPlot(RAC.RECAL_FILE, report.getRecalibrationTables(), recalibrationTables, requestedCovariates, RAC.KEEP_INTERMEDIATE_FILES); + RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), recalibrationTables, requestedCovariates); } else - RecalUtils.generateRecalibrationPlot(RAC.RECAL_FILE, recalibrationTables, requestedCovariates, RAC.KEEP_INTERMEDIATE_FILES); + RecalUtils.generateRecalibrationPlot(RAC, recalibrationTables, requestedCovariates); } @@ -313,14 +312,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed } private void generateReport() { - PrintStream output; - try { - output = new PrintStream(RAC.RECAL_FILE); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_FILE, "could not be created"); - } - - RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates, output); + RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index f4b00925e..e230817ec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -28,10 +28,10 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.report.GATKReportTable; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.recalibration.RecalUtils; import java.io.File; +import java.io.PrintStream; import java.util.Collections; import java.util.List; @@ -62,8 +62,22 @@ public class RecalibrationArgumentCollection { * and the raw empirical quality score calculated by phred-scaling the mismatch rate. */ @Gather(BQSRGatherer.class) - @Output - public File RECAL_FILE; + @Output(doc = "The output recalibration table file to create", required = true) + public PrintStream RECAL_TABLE; + + /** + * If not provided, then no plots will be generated (useful for queue scatter/gathering). + * However, we *highly* recommend that users generate these plots whenever possible for QC checking. + */ + @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false) + public PrintStream RECAL_PDF = null; + + /** + * If not provided, then a temporary file is created and then deleted upon completion. + */ + @Hidden + @Output(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false) + public PrintStream RECAL_CSV = null; /** * List all implemented covariates. @@ -166,12 +180,6 @@ public class RecalibrationArgumentCollection { @Hidden @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; - @Hidden - @Argument(fullName = "keep_intermediate_files", shortName = "k", required = false, doc ="does not remove the temporary csv file created to generate the plots") - public boolean KEEP_INTERMEDIATE_FILES = false; - @Hidden - @Argument(fullName = "no_plots", shortName = "np", required = false, doc = "does not generate any plots -- useful for queue scatter/gathering") - public boolean NO_PLOTS = false; public File recalibrationReport = null; @@ -205,10 +213,6 @@ public class RecalibrationArgumentCollection { argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); argumentsTable.addRowID("quantizing_levels", true); argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); - argumentsTable.addRowID("keep_intermediate_files", true); - argumentsTable.set("keep_intermediate_files", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, KEEP_INTERMEDIATE_FILES); - argumentsTable.addRowID("no_plots", true); - argumentsTable.set("no_plots", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, NO_PLOTS); argumentsTable.addRowID("recalibration_report", true); argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, recalibrationReport == null ? "null" : recalibrationReport.getAbsolutePath()); argumentsTable.addRowID("binary_tag_name", true); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index 20aabdb83..980ca715b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -47,7 +47,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.io.File; -import java.io.FileNotFoundException; +import java.io.IOException; import java.io.PrintStream; import java.util.*; @@ -333,8 +333,8 @@ public class RecalUtils { return covariate.getClass().getSimpleName().split("Covariate")[0]; } - public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile) { - outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile); + public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates) { + outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), RAC.RECAL_TABLE); } /** @@ -362,46 +362,36 @@ public class RecalUtils { report.print(outputFile); } - private static Pair initializeRecalibrationPlot(File filename) { - final PrintStream deltaTableStream; - final File deltaTableFileName = new File(filename + ".csv"); - try { - deltaTableStream = new PrintStream(deltaTableFileName); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(deltaTableFileName, "File " + deltaTableFileName + " could not be created"); - } - return new Pair(deltaTableStream, deltaTableFileName); - } - - private static void outputRecalibrationPlot(final File gatkReportFilename, Pair files, boolean keepIntermediates) { - final File csvFileName = files.getSecond(); - final File plotFileName = new File(csvFileName + ".pdf"); - files.getFirst().close(); + private static void outputRecalibrationPlot(final RecalibrationArgumentCollection RAC) { final RScriptExecutor executor = new RScriptExecutor(); executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); - executor.addArgs(csvFileName.getAbsolutePath()); - executor.addArgs(gatkReportFilename.getAbsolutePath()); - executor.addArgs(plotFileName.getAbsolutePath()); + //executor.addArgs(RAC.RECAL_CSV.getAbsolutePath()); + //executor.addArgs(RAC.RECAL_TABLE.getAbsolutePath()); + //executor.addArgs(RAC.RECAL_PDF.getAbsolutePath()); executor.exec(); - - if (!keepIntermediates) - if (!csvFileName.delete()) - throw new ReviewedStingException("Could not find file " + csvFileName.getAbsolutePath()); - } - public static void generateRecalibrationPlot(final File filename, final RecalibrationTables original, final Covariate[] requestedCovariates, final boolean keepIntermediates) { - final Pair files = initializeRecalibrationPlot(filename); - writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, true); - outputRecalibrationPlot(filename, files, keepIntermediates); + public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final Covariate[] requestedCovariates) { + generateRecalibrationPlot(RAC, original, null, requestedCovariates); } - public static void generateRecalibrationPlot(final File filename, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates, final boolean keepIntermediates) { - final Pair files = initializeRecalibrationPlot(filename); - writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", requestedCovariates, true); - writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, false); - outputRecalibrationPlot(filename, files, keepIntermediates); + public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates) { + File temporaryFile = null; + if ( RAC.RECAL_CSV == null ) { + try { + temporaryFile = File.createTempFile("BQSR", ".csv"); + temporaryFile.deleteOnExit(); + RAC.RECAL_CSV = new PrintStream(temporaryFile); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(temporaryFile, "Temporary csv file " + temporaryFile + " could not be created because " + e.getMessage()); + } + } + + if ( recalibrated != null ) + writeCSV(RAC.RECAL_CSV, recalibrated, "RECALIBRATED", requestedCovariates, true); + writeCSV(RAC.RECAL_CSV, original, "ORIGINAL", requestedCovariates, recalibrated == null); + outputRecalibrationPlot(RAC); } private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) { diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 271c07649..b22956b4a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -284,12 +284,6 @@ public class RecalibrationReport { else if (argument.equals("quantizing_levels")) RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value); - else if (argument.equals("keep_intermediate_files")) - RAC.KEEP_INTERMEDIATE_FILES = Boolean.parseBoolean((String) value); - - else if (argument.equals("no_plots")) - RAC.NO_PLOTS = Boolean.parseBoolean((String) value); - else if (argument.equals("recalibration_report")) RAC.recalibrationReport = (value == null) ? null : new File((String) value); From 4bb7a99f087df6aee2fea21c0d472f31b703f26f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 12 Sep 2012 11:51:44 -0400 Subject: [PATCH 12/17] Given that all classes implementing output stubs already have getters for the underlying OutputStream and File, it makes sense to unify that functionality into the Stub interface. Now it is possible to have an Engine utility method that iterates over all registered stubs to find the one representing a given OutputStream and return the File associated with it. --- .../sting/gatk/GenomeAnalysisEngine.java | 16 ++++++++++++++++ .../gatk/io/storage/SAMFileWriterStorage.java | 8 ++++---- .../io/storage/VariantContextWriterStorage.java | 6 +++--- .../sting/gatk/io/stubs/SAMFileWriterStub.java | 6 +++--- .../broadinstitute/sting/gatk/io/stubs/Stub.java | 13 +++++++++++++ .../gatk/io/stubs/VariantContextWriterStub.java | 8 ++++---- 6 files changed, 43 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 516ea8451..bc37b0557 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -63,6 +63,7 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; import java.io.File; +import java.io.OutputStream; import java.util.*; /** @@ -731,6 +732,21 @@ public class GenomeAnalysisEngine { outputs.add(stub); } + /** + * Iterates over all registered output stubs and tries to find the one representing the given OutputStream. + * + * @param output the stream to check for + * @return the file associated with the given stream/stub if available, null otherwise + */ + public File getFilenameFromAssociatedOutputStream(final OutputStream output) { + for ( final Stub stub : outputs ) { + if ( stub.getOutputStream() == output ) + return stub.getOutputFile(); + } + + return null; + } + /** * Returns the tag associated with a given command-line argument. * @param key Object for which to inspect the tag. diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java index 300e801e6..9f69a4144 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java @@ -50,7 +50,7 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage, StingSAMFileWrite * Retrieves the SAM file to (ultimately) be created. * @return The SAM file. Must not be null. */ - public File getSAMFile() { + public File getOutputFile() { return samFile; } @@ -162,7 +162,7 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite simplifyBAM = v; } - public OutputStream getSAMOutputStream() { + public OutputStream getOutputStream() { return samOutputStream; } @@ -220,7 +220,7 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite /** * Gets whether to generate an md5 on-the-fly for this BAM. - * @return True generates the md5. False means skip writing the file. + * @param generateMD5 True generates the md5. False means skip writing the file. */ public void setGenerateMD5(boolean generateMD5) { if(writeStarted) diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java index b042144b6..873f5b7c8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java @@ -27,6 +27,9 @@ package org.broadinstitute.sting.gatk.io.stubs; import org.broadinstitute.sting.gatk.io.OutputTracker; +import java.io.File; +import java.io.OutputStream; + /** * A stub used for managing IO. Acts as a proxy for IO streams * not yet created or streams that need significant external @@ -43,4 +46,14 @@ public interface Stub { * @param outputTracker The connector used to provide an appropriate stream. */ public void register( OutputTracker outputTracker ); + + /** + * Returns the OutputStream represented by this stub or null if not available. + */ + public OutputStream getOutputStream(); + + /** + * Returns the File represented by this stub or null if not available. + */ + public File getOutputFile(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java index ee1dc63e6..f92d78bb5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java @@ -140,7 +140,7 @@ public class VariantContextWriterStub implements Stub, Var * Retrieves the file to (ultimately) be created. * @return The file. Can be null if genotypeStream is not. */ - public File getFile() { + public File getOutputFile() { return genotypeFile; } @@ -148,7 +148,7 @@ public class VariantContextWriterStub implements Stub, Var * Retrieves the output stearm to which to (ultimately) write. * @return The file. Can be null if genotypeFile is not. */ - public PrintStream getOutputStream() { + public OutputStream getOutputStream() { return genotypeStream; } @@ -196,7 +196,7 @@ public class VariantContextWriterStub implements Stub, Var if ( engine.lenientVCFProcessing() ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER); if ( indexOnTheFly && ! isCompressed() ) options.add(Options.INDEX_ON_THE_FLY); - if ( forceBCF || (getFile() != null && VariantContextWriterFactory.isBCFOutput(getFile())) ) + if ( forceBCF || (getOutputFile() != null && VariantContextWriterFactory.isBCFOutput(getOutputFile())) ) options.add(Options.FORCE_BCF); return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options); @@ -271,7 +271,7 @@ public class VariantContextWriterStub implements Stub, Var public boolean alsoWriteBCFForTest() { return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded ! isCompressed() && // for non-compressed outputs - getFile() != null && // that are going to disk + getOutputFile() != null && // that are going to disk engine.getArguments().generateShadowBCF; // and we actually want to do it } From 849a2b883950abc25d8390aecc69de7a5d44c167 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 12 Sep 2012 12:23:00 -0400 Subject: [PATCH 13/17] Adding HC integration test for _structural_ insertions and deletions. --- .../haplotypecaller/HaplotypeCallerIntegrationTest.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index b45c027a7..b4ac2b86d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -73,4 +73,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8d092b25f40456e618eef91fdce8adf0")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } + + @Test + public void HCTestStructuralIndels() { + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c29e61810c056b52a47baae0696931ea")); + executeTest("HCTestStructuralIndels: ", spec); + } + } From d94d0d15c2e92568df7e20b6c3caf87aa20ff815 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 12 Sep 2012 15:15:40 -0400 Subject: [PATCH 15/17] Complete overhaul of previous commits to make it all work with scatter-gather. Now tracks output files correctly and can print to stdout. --- .../walkers/bqsr/BQSRIntegrationTest.java | 30 +++++++++---------- .../sting/gatk/GenomeAnalysisEngine.java | 15 ---------- .../sting/gatk/walkers/bqsr/BQSRGatherer.java | 15 ++++++---- .../gatk/walkers/bqsr/BaseRecalibrator.java | 12 ++++++-- .../bqsr/RecalibrationArgumentCollection.java | 15 ++++++---- .../sting/utils/recalibration/RecalUtils.java | 26 ++++++++-------- .../recalibration/RecalibrationReport.java | 5 +++- 7 files changed, 60 insertions(+), 58 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 58ce7ffef..b0e9ef4fe 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -49,21 +49,21 @@ public class BQSRIntegrationTest extends WalkerTest { String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam"; String HiSeqInterval = "chr1:10,000,000-10,100,000"; return new Object[][]{ - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "5a28b9fb5f2e36703e9804d276c38009")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "646a7c6db12cf0ec119bc27abed9c7b8")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "777f21676435837ba470497e17624266")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f7d77e0d86d033c69f25ef9858fdb95d")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "c3866646833cbb60831695d016d614d1")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "04c1d020bdb25fc55c3983748702290c")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "edf77f41cdd6c27f987cb1ecbcaa889b")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "3d52db844e8220d2dbdcd1339b3d3000")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "47605edafb4da0859bf735a6bd2dfe9c")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "0ac92d3548fdca8f253121842bb38c65")}, - {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "de7448f5bf787c17f1ee4c415bc90d3c")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "60542fe8a3cc89a47421767c6e1c11cd")}, - {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "f9a5a8f1b8f77f4c8857ccba8bff49a6")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "3d52db844e8220d2dbdcd1339b3d3000")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "919d88b173b0c11cbca762132bc94ab9")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "be6c7bc0b79a2d0395d21cd0154540d5")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "65781095beb41d8feca26e93e04dcc0b")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "8ee1fed1713daca1f36e8b30bee2cd23")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "9449d8a8baac742f46673e9b8314220b")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "39313c6e3b85142548fee9b6c130e7b6")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "15eae9e834ed80b24660393c6df87f85")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "8485d8fd5e780e98d720dfbf79f26528")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "c423d1d443822dae404239bb9a746b96")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "fb0a6aef430f562ed5e0002d03e0c619")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "efee7bcb89abe36da1cfd8a635d37cd2")}, + {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "0e8a3238902a1ff0f0c657fb09b4c022")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "5e58d3dcf5ca38f008a64d1c0743ed83")}, + {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "1a8e5c85c7935eb1bd2203f5c86ce1db")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "c423d1d443822dae404239bb9a746b96")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "6762b39dc027056365280a9d582a6713")}, }; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index bc37b0557..fc2546173 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -732,21 +732,6 @@ public class GenomeAnalysisEngine { outputs.add(stub); } - /** - * Iterates over all registered output stubs and tries to find the one representing the given OutputStream. - * - * @param output the stream to check for - * @return the file associated with the given stream/stub if available, null otherwise - */ - public File getFilenameFromAssociatedOutputStream(final OutputStream output) { - for ( final Stub stub : outputs ) { - if ( stub.getOutputStream() == output ) - return stub.getOutputFile(); - } - - return null; - } - /** * Returns the tag associated with a given command-line argument. * @param key Object for which to inspect the tag. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java index 128b3f809..dbb628135 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -70,12 +70,15 @@ public class BQSRGatherer extends Gatherer { generalReport.calculateQuantizedQualities(); RecalibrationArgumentCollection RAC = generalReport.getRAC(); - if (RAC.recalibrationReport != null && RAC.RECAL_PDF != null) { - final RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport); - RecalUtils.generateRecalibrationPlot(RAC, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates()); - } - else if (RAC.RECAL_PDF != null) { - RecalUtils.generateRecalibrationPlot(RAC, generalReport.getRecalibrationTables(), generalReport.getCovariates()); + if ( RAC.RECAL_PDF_FILE != null ) { + RAC.RECAL_TABLE_FILE = output; + if ( RAC.existingRecalibrationReport != null ) { + final RecalibrationReport originalReport = new RecalibrationReport(RAC.existingRecalibrationReport); + RecalUtils.generateRecalibrationPlot(RAC, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates()); + } + else { + RecalUtils.generateRecalibrationPlot(RAC, generalReport.getRecalibrationTables(), generalReport.getCovariates()); + } } generalReport.output(outputFile); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 04ebeed55..e78b9b6fc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -50,6 +50,8 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.io.File; +import java.io.IOException; +import java.io.PrintStream; import java.lang.reflect.Constructor; import java.util.ArrayList; @@ -149,7 +151,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed RecalUtils.listAvailableCovariates(logger); System.exit(0); } - RAC.recalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table + RAC.existingRecalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates ArrayList requiredCovariates = covariates.getFirst(); @@ -168,6 +170,12 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection } + try { + RAC.RECAL_TABLE = new PrintStream(RAC.RECAL_TABLE_FILE); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e); + } + int numReadGroups = 0; for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) numReadGroups += header.getReadGroups().size(); @@ -283,7 +291,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed generateReport(); logger.info("...done!"); - if (RAC.RECAL_PDF != null) { + if (RAC.RECAL_PDF_FILE != null) { logger.info("Generating recalibration plots..."); generatePlots(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index e230817ec..d08239b96 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -59,10 +59,11 @@ public class RecalibrationArgumentCollection { * After the header, data records occur one per line until the end of the file. The first several items on a line are the * values of the individual covariates and will change depending on which covariates were specified at runtime. The last * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, - * and the raw empirical quality score calculated by phred-scaling the mismatch rate. + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out. */ @Gather(BQSRGatherer.class) @Output(doc = "The output recalibration table file to create", required = true) + public File RECAL_TABLE_FILE = null; public PrintStream RECAL_TABLE; /** @@ -70,14 +71,14 @@ public class RecalibrationArgumentCollection { * However, we *highly* recommend that users generate these plots whenever possible for QC checking. */ @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false) - public PrintStream RECAL_PDF = null; + public File RECAL_PDF_FILE = null; /** * If not provided, then a temporary file is created and then deleted upon completion. */ @Hidden - @Output(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false) - public PrintStream RECAL_CSV = null; + @Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false) + public File RECAL_CSV_FILE = null; /** * List all implemented covariates. @@ -181,7 +182,7 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; - public File recalibrationReport = null; + public File existingRecalibrationReport = null; public GATKReportTable generateReportTable(final String covariateNames) { GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2); @@ -214,7 +215,9 @@ public class RecalibrationArgumentCollection { argumentsTable.addRowID("quantizing_levels", true); argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); argumentsTable.addRowID("recalibration_report", true); - argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, recalibrationReport == null ? "null" : recalibrationReport.getAbsolutePath()); + argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath()); + argumentsTable.addRowID("plot_pdf_file", true); + argumentsTable.set("plot_pdf_file", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RECAL_PDF_FILE == null ? "null" : RECAL_PDF_FILE.getAbsolutePath()); argumentsTable.addRowID("binary_tag_name", true); argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME); return argumentsTable; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index 980ca715b..ca490789f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -366,9 +366,9 @@ public class RecalUtils { final RScriptExecutor executor = new RScriptExecutor(); executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); - //executor.addArgs(RAC.RECAL_CSV.getAbsolutePath()); - //executor.addArgs(RAC.RECAL_TABLE.getAbsolutePath()); - //executor.addArgs(RAC.RECAL_PDF.getAbsolutePath()); + executor.addArgs(RAC.RECAL_CSV_FILE.getAbsolutePath()); + executor.addArgs(RAC.RECAL_TABLE_FILE.getAbsolutePath()); + executor.addArgs(RAC.RECAL_PDF_FILE.getAbsolutePath()); executor.exec(); } @@ -377,20 +377,20 @@ public class RecalUtils { } public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates) { - File temporaryFile = null; - if ( RAC.RECAL_CSV == null ) { - try { - temporaryFile = File.createTempFile("BQSR", ".csv"); - temporaryFile.deleteOnExit(); - RAC.RECAL_CSV = new PrintStream(temporaryFile); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(temporaryFile, "Temporary csv file " + temporaryFile + " could not be created because " + e.getMessage()); + final PrintStream csvFile; + try { + if ( RAC.RECAL_CSV_FILE == null ) { + RAC.RECAL_CSV_FILE = File.createTempFile("BQSR", ".csv"); + RAC.RECAL_CSV_FILE.deleteOnExit(); } + csvFile = new PrintStream(RAC.RECAL_CSV_FILE); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_CSV_FILE, e); } if ( recalibrated != null ) - writeCSV(RAC.RECAL_CSV, recalibrated, "RECALIBRATED", requestedCovariates, true); - writeCSV(RAC.RECAL_CSV, original, "ORIGINAL", requestedCovariates, recalibrated == null); + writeCSV(csvFile, recalibrated, "RECALIBRATED", requestedCovariates, true); + writeCSV(csvFile, original, "ORIGINAL", requestedCovariates, recalibrated == null); outputRecalibrationPlot(RAC); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index b22956b4a..41b07832c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -285,7 +285,10 @@ public class RecalibrationReport { RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value); else if (argument.equals("recalibration_report")) - RAC.recalibrationReport = (value == null) ? null : new File((String) value); + RAC.existingRecalibrationReport = (value == null) ? null : new File((String) value); + + else if (argument.equals("plot_pdf_file")) + RAC.RECAL_PDF_FILE = (value == null) ? null : new File((String) value); else if (argument.equals("binary_tag_name")) RAC.BINARY_TAG_NAME = (value == null) ? null : (String) value; From 86be50f18dc56cf60288065b7e6895e080b45f69 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 14 Sep 2012 10:58:44 -0400 Subject: [PATCH 17/17] Add note to docs that the --list argument requires full command-line --- .../sting/gatk/walkers/annotator/VariantAnnotator.java | 3 +++ .../gatk/walkers/bqsr/RecalibrationArgumentCollection.java | 2 +- .../sting/gatk/walkers/varianteval/VariantEval.java | 4 +++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index cce106210..c4de9ed45 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -161,6 +161,9 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="useAllAnnotations", shortName="all", doc="Use all possible annotations (not for the faint of heart)", required=false) protected Boolean USE_ALL_ANNOTATIONS = false; + /** + * Note that the --list argument requires a fully resolved and correct command-line to work. + */ @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit") protected Boolean LIST = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index d08239b96..f1f0ce38e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -81,7 +81,7 @@ public class RecalibrationArgumentCollection { public File RECAL_CSV_FILE = null; /** - * List all implemented covariates. + * Note that the --list argument requires a fully resolved and correct command-line to work. */ @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false) public boolean LIST_ONLY = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java index 58cd14737..6971be807 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java @@ -126,7 +126,9 @@ public class VariantEval extends RodWalker implements TreeRedu @Input(fullName="goldStandard", shortName = "gold", doc="Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison", required=false) public RodBinding goldStandard = null; - // Help arguments + /** + * Note that the --list argument requires a fully resolved and correct command-line to work. + */ @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false) protected Boolean LIST = false;