From 91f320453491c3981cb980b617d352121a5b8705 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 11 Sep 2012 16:52:54 -0400 Subject: [PATCH] VCF/BCF writers once again automatically write out no-call genotypes for samples in the VCFHeader but not in the VC itself -- Turns out this was consuming 30% of the UG runtime, and causing problems elsewhere. -- Removed addMissingSamples from VariantcontextUtils, and calls to it -- Updated VCF / BCF writers to automatically write out a diploid no call for missing samples -- Added unit tests for this behavior in VariantContextWritersUnitTest --- .../genotyper/UnifiedGenotyperEngine.java | 19 +------- .../walkers/variantutils/CombineVariants.java | 2 +- .../walkers/variantutils/VariantsToVCF.java | 1 - .../utils/variantcontext/GenotypeBuilder.java | 13 ++++++ .../variantcontext/VariantContextUtils.java | 28 +----------- .../variantcontext/writer/BCF2Writer.java | 9 +++- .../variantcontext/writer/VCFWriter.java | 12 +---- .../VariantContextTestProvider.java | 45 +++++++++++++++++++ .../writer/VariantContextWritersUnitTest.java | 10 +++++ 9 files changed, 79 insertions(+), 60 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index a57c877e0..469d63b8a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -38,7 +38,6 @@ import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -198,23 +197,7 @@ public class UnifiedGenotyperEngine { } } - return addMissingSamples(results, allSamples); - } - - private List addMissingSamples(final List calls, final Set allSamples) { - if ( calls.isEmpty() || allSamples == null ) return calls; - - final List withAllSamples = new ArrayList(calls.size()); - for ( final VariantCallContext call : calls ) { - if ( call == null ) - withAllSamples.add(null); - else { - final VariantContext withoutMissing = VariantContextUtils.addMissingSamples(call, allSamples); - withAllSamples.add(new VariantCallContext(withoutMissing, call.confidentlyCalled, call.shouldEmit)); - } - } - - return withAllSamples; + return results; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 555999bdb..b1d8dc91d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -313,7 +313,7 @@ public class CombineVariants extends RodWalker implements Tree VariantContextUtils.calculateChromosomeCounts(builder, false); if ( minimalVCF ) VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); - vcfWriter.add(VariantContextUtils.addMissingSamples(builder.make(), samples)); + vcfWriter.add(builder.make()); } return vcs.isEmpty() ? 0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 78c9c4a1c..5f80f77a4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -246,7 +246,6 @@ public class VariantsToVCF extends RodWalker { } vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); - vc = VariantContextUtils.addMissingSamples(vc, samples); vcfwriter.add(vc); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java index 0ee32fa2e..9337a78f9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java @@ -53,6 +53,8 @@ import java.util.*; */ @Invariant({"alleles != null"}) public final class GenotypeBuilder { + private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + private String sampleName = null; private List alleles = Collections.emptyList(); @@ -90,6 +92,17 @@ public final class GenotypeBuilder { return new GenotypeBuilder(sampleName, alleles).PL(gls).make(); } + /** + * Create a new Genotype object for a sample that's missing from the VC (i.e., in + * the output header). Defaults to a diploid no call genotype ./. + * + * @param sampleName the name of this sample + * @return an initialized Genotype with sampleName that's a diploid ./. no call genotype + */ + public static Genotype createMissing(final String sampleName) { + return new GenotypeBuilder(sampleName).alleles(DIPLOID_NO_CALL).make(); + } + /** * Create a empty builder. Both a sampleName and alleles must be provided * before trying to make a Genotype from this builder. diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index d7e4a7135..8abcf115a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -32,8 +32,8 @@ import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -47,7 +47,6 @@ public class VariantContextUtils { public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_PREFIX = "filterIn"; - private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); private static Set MISSING_KEYS_WARNED_ABOUT = new HashSet(); final public static JexlEngine engine = new JexlEngine(); @@ -60,31 +59,6 @@ public class VariantContextUtils { engine.setDebug(false); } - /** - * Ensures that VC contains all of the samples in allSamples by adding missing samples to - * the resulting VC with default diploid ./. genotypes - * - * @param vc the VariantContext - * @param allSamples all of the samples needed - * @return a new VariantContext with missing samples added - */ - public static VariantContext addMissingSamples(final VariantContext vc, final Set allSamples) { - // TODO -- what's the fastest way to do this calculation? - final Set missingSamples = new HashSet(allSamples); - missingSamples.removeAll(vc.getSampleNames()); - - if ( missingSamples.isEmpty() ) - return vc; - else { - //logger.warn("Adding " + missingSamples.size() + " missing samples to called context"); - final GenotypesContext gc = GenotypesContext.copy(vc.getGenotypes()); - for ( final String missing : missingSamples ) { - gc.add(new GenotypeBuilder(missing).alleles(DIPLOID_NO_CALL).make()); - } - return new VariantContextBuilder(vc).genotypes(gc).make(); - } - } - /** * Update the attributes of the attributes map given the VariantContext to reflect the * proper chromosome-based VCF tags diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index e4c64b26b..a338c7c0d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -32,7 +32,10 @@ import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils; import org.broadinstitute.sting.utils.codecs.bcf2.BCFVersion; -import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFContigHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.*; @@ -345,10 +348,12 @@ class BCF2Writer extends IndexingVariantContextWriter { final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field); if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT"); + assert writer != null; + writer.start(encoder, vc); for ( final String name : sampleNames ) { Genotype g = vc.getGenotype(name); - if ( g == null ) VCFWriter.missingSampleError(vc, header); + if ( g == null ) g = GenotypeBuilder.createMissing(name); writer.addGenotype(encoder, vc, g); } writer.done(encoder, vc); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index db74f2263..93b3b603f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.variantcontext.writer; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -343,9 +342,7 @@ class VCFWriter extends IndexingVariantContextWriter { mWriter.write(VCFConstants.FIELD_SEPARATOR); Genotype g = vc.getGenotype(sample); - if ( g == null ) { - missingSampleError(vc, mHeader); - } + if ( g == null ) g = GenotypeBuilder.createMissing(sample); final List attrs = new ArrayList(genotypeFormatKeys.size()); for ( String field : genotypeFormatKeys ) { @@ -426,13 +423,6 @@ class VCFWriter extends IndexingVariantContextWriter { } } - public static final void missingSampleError(final VariantContext vc, final VCFHeader header) { - final List badSampleNames = new ArrayList(); - for ( final String x : header.getGenotypeSamples() ) - if ( ! vc.hasGenotype(x) ) badSampleNames.add(x); - throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects. Missing samples are " + Utils.join(",", badSampleNames)); - } - private boolean isMissingValue(String s) { // we need to deal with the case that it's a list of missing values return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length()); diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 26e2dbfbc..6785fa816 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -596,6 +596,51 @@ public class VariantContextTestProvider { return TEST_DATAs; } + public static void testReaderWriterWithMissingGenotypes(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { + final int nSamples = data.header.getNGenotypeSamples(); + if ( nSamples > 2 ) { + for ( final VariantContext vc : data.vcs ) + if ( vc.isSymbolic() ) + // cannot handle symbolic alleles because they may be weird non-call VCFs + return; + + final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension()); + tmpFile.deleteOnExit(); + + // write expected to disk + final EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); + final VariantContextWriter writer = tester.makeWriter(tmpFile, options); + + final Set samplesInVCF = new HashSet(data.header.getGenotypeSamples()); + final List missingSamples = Arrays.asList("MISSING1", "MISSING2"); + final List allSamples = new ArrayList(missingSamples); + allSamples.addAll(samplesInVCF); + + final VCFHeader header = new VCFHeader(data.header.getMetaDataInInputOrder(), allSamples); + writeVCsToFile(writer, header, data.vcs); + + // ensure writing of expected == actual + final Pair> p = readAllVCs(tmpFile, tester.makeCodec()); + final Iterable actual = p.getSecond(); + + int i = 0; + for ( final VariantContext readVC : actual ) { + if ( readVC == null ) continue; // sometimes we read null records... + final VariantContext expected = data.vcs.get(i++); + for ( final Genotype g : readVC.getGenotypes() ) { + Assert.assertTrue(allSamples.contains(g.getSampleName())); + if ( samplesInVCF.contains(g.getSampleName()) ) { + assertEquals(g, expected.getGenotype(g.getSampleName())); + } else { + // missing + Assert.assertTrue(g.isNoCall()); + } + } + } + + } + } + public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { testReaderWriter(tester, data.header, data.vcs, data.vcs, true); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java index 1b791bf6c..adf3eb235 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java @@ -82,6 +82,11 @@ public class VariantContextWritersUnitTest extends BaseTest { VariantContextTestProvider.testReaderWriter(new BCFIOTester(), testData); } + @Test(dataProvider = "VariantContextTest_SingleContexts") + public void testBCF2WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException { + VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new BCFIOTester(), testData); + } + private class BCFIOTester extends VariantContextTestProvider.VariantContextIOTest { @Override public String getExtension() { @@ -110,6 +115,11 @@ public class VariantContextWritersUnitTest extends BaseTest { VariantContextTestProvider.testReaderWriter(new VCFIOTester(), testData); } + @Test(enabled = true, dataProvider = "VariantContextTest_SingleContexts") + public void testVCF4WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException { + VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new VCFIOTester(), testData); + } + private class VCFIOTester extends VariantContextTestProvider.VariantContextIOTest { @Override public String getExtension() {