VCFWriter and BCFWriter no longer allow missing samples in the VC compared to their header

-- They now throw an error, as its really unsafe to write out ./. as a special case in the VCFWriter as occurred previously. -- Added convenience method in VariantContextUtils.addMissingSamples(vc, allSamples) that returns a complete VC where samples are given ./. Genotype objects -- This allows us to properly pass tests of creating / writing / reading VCFs and BCFs, which previously differed because the VC from the VCF would actually be different from its original VC -- Updated UG, UGEngine, GenotypeAndValidateWalker, CombineVariants, and VariantsToVCF to manage the master list of samples they are writing out and addMissingSamples via the VCU function
2012-06-27 17:39:00 -04:00 · 2012-06-27 17:39:00 -04:00 · 7144154f53
parent 4811a00891
commit 7144154f53
8 changed files with 103 additions and 35 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@ -187,6 +187,8 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
    // the annotation engine
    private VariantAnnotatorEngine annotationEngine;

+    private Set<String> samples;
+
    // enable deletions in the pileup
    @Override
    public boolean includeReadsWithDeletionAtLoci() { return true; }
@ -231,7 +233,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
            logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode");
        
        // get all of the unique sample names
-        Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
+        samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());

        // initialize the verbose writer
        if ( verboseWriter != null )
@ -298,7 +300,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
     * @return the VariantCallContext object
     */
    public List<VariantCallContext> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
-        return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext);
+        return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, samples);
    }

    public UGStatistics reduceInit() { return new UGStatistics(); }
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@ -140,14 +140,39 @@ public class UnifiedGenotyperEngine {
    }

    /**
-     * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper.
+     * @see #calculateLikelihoodsAndGenotypes(org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker, org.broadinstitute.sting.gatk.contexts.ReferenceContext, org.broadinstitute.sting.gatk.contexts.AlignmentContext, java.util.Set)
     *
-     * @param tracker    the meta data tracker
-     * @param refContext the reference base
-     * @param rawContext contextual information around the locus
-     * @return the VariantCallContext object
+     * same as the full call but with allSamples == null
+     *
+     * @param tracker
+     * @param refContext
+     * @param rawContext
+     * @return
     */
-    public List<VariantCallContext> calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
+    public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
+                                                                     final ReferenceContext refContext,
+                                                                     final AlignmentContext rawContext) {
+        return calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, null);
+    }
+
+
+        /**
+        * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper.
+        *
+        * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype
+        * for every sample in allSamples.  If it's null there's no such guarentee.  Providing this
+        * argument is critical when the resulting calls will be written to a VCF file.
+        *
+        * @param tracker    the meta data tracker
+        * @param refContext the reference base
+        * @param rawContext contextual information around the locus
+        * @param allSamples set of all sample names that we might call (i.e., those in the VCF header)
+        * @return the VariantCallContext object
+        */
+    public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
+                                                                     final ReferenceContext refContext,
+                                                                     final AlignmentContext rawContext,
+                                                                     final Set<String> allSamples) {
        final List<VariantCallContext> results = new ArrayList<VariantCallContext>(2);

        final List<GenotypeLikelihoodsCalculationModel.Model> models = getGLModelsToUse(tracker, refContext, rawContext);
@ -168,7 +193,23 @@ public class UnifiedGenotyperEngine {
            }        
        }

-        return results;
+        return addMissingSamples(results, allSamples);
+    }
+
+    private List<VariantCallContext> addMissingSamples(final List<VariantCallContext> calls, final Set<String> allSamples) {
+        if ( calls.isEmpty() || allSamples == null ) return calls;
+
+        final List<VariantCallContext> withAllSamples = new ArrayList<VariantCallContext>(calls.size());
+        for ( final VariantCallContext call : calls ) {
+            if ( call == null )
+                withAllSamples.add(call);
+            else {
+                final VariantContext withoutMissing = VariantContextUtils.addMissingSamples(call, allSamples);
+                withAllSamples.add(new VariantCallContext(withoutMissing, call.confidentlyCalled, call.shouldEmit));
+            }
+        }
+
+        return withAllSamples;
    }

    /**
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java
@ -261,6 +261,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk

    private UnifiedGenotyperEngine snpEngine;
    private UnifiedGenotyperEngine indelEngine;
+    private Set<String> samples;

    public static class CountedData {
        private long nAltCalledAlt = 0L;
@ -307,7 +308,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
        // Initialize VCF header
        if (vcfWriter != null) {
            Map<String, VCFHeader> header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName());
-            Set<String> samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
+            samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
            Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(header.values(), logger);
            headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate"));
            vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
@ -174,17 +174,24 @@ public class CombineVariants extends RodWalker<Integer, Integer> {

    /** Optimization to strip out genotypes before merging if we are doing a sites_only output */
    private boolean sitesOnlyVCF = false;
+    private Set<String> samples;

    public void initialize() {
        Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit());

+        if ( vcfWriter instanceof VariantContextWriterStub) {
+            sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES);
+            if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance");
+        } else
+            logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option");
+
        if ( PRIORITY_STRING == null ) {
            PRIORITY_STRING = Utils.join(",", vcfRods.keySet());
            logger.info("Priority string not provided, using arbitrary genotyping order: " + PRIORITY_STRING);
        }

        validateAnnotateUnionArguments();
-        Set<String> samples = SampleUtils.getSampleList(vcfRods, genotypeMergeOption);
+        samples = sitesOnlyVCF ? Collections.<String>emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption);

        if ( SET_KEY.toLowerCase().equals("null") )
            SET_KEY = null;
@ -194,15 +201,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
            headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants"));
        if ( !ASSUME_IDENTICAL_SAMPLES )
             headerLines.addAll(Arrays.asList(ChromosomeCounts.descriptions));
-        VCFHeader vcfHeader = new VCFHeader(headerLines, sitesOnlyVCF ? Collections.<String>emptySet() : samples);
+        VCFHeader vcfHeader = new VCFHeader(headerLines, samples);
        vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER);
        vcfWriter.writeHeader(vcfHeader);
-
-        if ( vcfWriter instanceof VariantContextWriterStub) {
-            sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES);
-            if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance");
-        } else
-            logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option");
    }

    private void validateAnnotateUnionArguments() {
@ -296,7 +297,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
            VariantContextUtils.calculateChromosomeCounts(builder, false);
            if ( minimalVCF )
                VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY));
-            vcfWriter.add(builder.make());
+            vcfWriter.add(VariantContextUtils.addMissingSamples(builder.make(), samples));
        }

        return vcs.isEmpty() ? 0 : 1;
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
@ -108,6 +108,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {

    private Set<String> allowedGenotypeFormatStrings = new HashSet<String>();
    private boolean wroteHeader = false;
+    private Set<String> samples;

    // for dealing with indels in hapmap
    CloseableIterator<GATKFeature> dbsnpIterator = null;
@ -228,7 +229,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
                }
            }

-            Set<String> samples = new LinkedHashSet<String>();
+            samples = new LinkedHashSet<String>();
            if ( sampleName != null ) {
                samples.add(sampleName);
            } else {
@ -252,6 +253,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
        }

        vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings);
+        vc = VariantContextUtils.addMissingSamples(vc, samples);
        vcfwriter.add(vc);
    }

--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
@ -46,6 +46,7 @@ public class VariantContextUtils {
    public final static String MERGE_FILTER_IN_ALL = "FilteredInAll";
    public final static String MERGE_REF_IN_ALL = "ReferenceInAll";
    public final static String MERGE_FILTER_PREFIX = "filterIn";
+    private static final List<Allele> DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);

    final public static JexlEngine engine = new JexlEngine();
    public static final int DEFAULT_PLOIDY = 2;
@ -57,6 +58,31 @@ public class VariantContextUtils {
        engine.setDebug(false);
    }

+    /**
+     * Ensures that VC contains all of the samples in allSamples by adding missing samples to
+     * the resulting VC with default diploid ./. genotypes
+     *
+     * @param vc
+     * @param allSamples
+     * @return
+     */
+    public static VariantContext addMissingSamples(final VariantContext vc, final Set<String> allSamples) {
+        // TODO -- what's the fastest way to do this calculation?
+        final Set<String> missingSamples = new HashSet<String>(allSamples);
+        missingSamples.removeAll(vc.getSampleNames());
+
+        if ( missingSamples.isEmpty() )
+            return vc;
+        else {
+            //logger.warn("Adding " + missingSamples.size() + " missing samples to called context");
+            final GenotypesContext gc = GenotypesContext.copy(vc.getGenotypes());
+            for ( final String missing : missingSamples ) {
+                gc.add(new GenotypeBuilder(missing).alleles(DIPLOID_NO_CALL).make());
+            }
+            return new VariantContextBuilder(vc).genotypes(gc).make();
+        }
+    }
+
    /**
     * Update the attributes of the attributes map given the VariantContext to reflect the
     * proper chromosome-based VCF tags
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java
@ -302,9 +302,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
                writer.start(encoder, vc);
                for ( final String name : sampleNames ) {
                    Genotype g = vc.getGenotype(name);
-                    if ( g == null )
-                        // we don't have any data about g at all
-                        g = new GenotypeBuilder(name).alleles(MISSING_GENOTYPE).make();
+                    if ( g == null ) VCFWriter.missingSampleError(vc, header);
                    writer.addGenotype(encoder, vc, g);
                }
                writer.done(encoder, vc);
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java
@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.variantcontext.writer;
 import net.sf.samtools.SAMSequenceDictionary;
 import org.broad.tribble.TribbleException;
 import org.broad.tribble.util.ParsingUtils;
+import org.broadinstitute.sting.utils.Utils;
 import org.broadinstitute.sting.utils.codecs.vcf.*;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.exceptions.UserException;
@ -339,23 +340,12 @@ class VCFWriter extends IndexingVariantContextWriter {
     */
    private void addGenotypeData(VariantContext vc, Map<Allele, String> alleleMap, List<String> genotypeFormatKeys)
    throws IOException {
-//        if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) {
-//            final List<String> badSampleNames = new ArrayList<String>();
-//            for ( final Genotype g : vc.getGenotypes() )
-//                if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) )
-//                    badSampleNames.add(g.getSampleName());
-//            throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames));
-//        }
-
        for ( String sample : mHeader.getGenotypeSamples() ) {
            mWriter.write(VCFConstants.FIELD_SEPARATOR);

            Genotype g = vc.getGenotype(sample);
            if ( g == null ) {
-                // TODO -- The VariantContext needs to know what the general ploidy is of the samples
-                // TODO -- We shouldn't be assuming diploid genotypes here!
-                mWriter.write(VCFConstants.EMPTY_GENOTYPE);
-                continue;
+                missingSampleError(vc, mHeader);
            }

            List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
@ -439,6 +429,13 @@ class VCFWriter extends IndexingVariantContextWriter {
        }
    }

+    public static final void missingSampleError(final VariantContext vc, final VCFHeader header) {
+        final List<String> badSampleNames = new ArrayList<String>();
+        for ( final String x : header.getGenotypeSamples() )
+            if ( ! vc.hasGenotype(x) ) badSampleNames.add(x);
+        throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects.  Missing samples are " + Utils.join(",", badSampleNames));
+    }
+
    private boolean isMissingValue(String s) {
        // we need to deal with the case that it's a list of missing values
        return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length());