diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 5b9a83a1b..29ca1265c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -187,6 +187,8 @@ public class UnifiedGenotyper extends LocusWalker, Unif // the annotation engine private VariantAnnotatorEngine annotationEngine; + private Set samples; + // enable deletions in the pileup @Override public boolean includeReadsWithDeletionAtLoci() { return true; } @@ -231,7 +233,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode"); // get all of the unique sample names - Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // initialize the verbose writer if ( verboseWriter != null ) @@ -298,7 +300,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif * @return the VariantCallContext object */ public List map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext); + return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, samples); } public UGStatistics reduceInit() { return new UGStatistics(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 60fa75f41..3c32d132f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -140,14 +140,39 @@ public class UnifiedGenotyperEngine { } /** - * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. + * @see #calculateLikelihoodsAndGenotypes(org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker, org.broadinstitute.sting.gatk.contexts.ReferenceContext, org.broadinstitute.sting.gatk.contexts.AlignmentContext, java.util.Set) * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @return the VariantCallContext object + * same as the full call but with allSamples == null + * + * @param tracker + * @param refContext + * @param rawContext + * @return */ - public List calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { + public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext) { + return calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, null); + } + + + /** + * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. + * + * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype + * for every sample in allSamples. If it's null there's no such guarentee. Providing this + * argument is critical when the resulting calls will be written to a VCF file. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param allSamples set of all sample names that we might call (i.e., those in the VCF header) + * @return the VariantCallContext object + */ + public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Set allSamples) { final List results = new ArrayList(2); final List models = getGLModelsToUse(tracker, refContext, rawContext); @@ -168,7 +193,23 @@ public class UnifiedGenotyperEngine { } } - return results; + return addMissingSamples(results, allSamples); + } + + private List addMissingSamples(final List calls, final Set allSamples) { + if ( calls.isEmpty() || allSamples == null ) return calls; + + final List withAllSamples = new ArrayList(calls.size()); + for ( final VariantCallContext call : calls ) { + if ( call == null ) + withAllSamples.add(call); + else { + final VariantContext withoutMissing = VariantContextUtils.addMissingSamples(call, allSamples); + withAllSamples.add(new VariantCallContext(withoutMissing, call.confidentlyCalled, call.shouldEmit)); + } + } + + return withAllSamples; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java index 22c0131c2..2e3fc26f6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java @@ -261,6 +261,7 @@ public class GenotypeAndValidateWalker extends RodWalker samples; public static class CountedData { private long nAltCalledAlt = 0L; @@ -307,7 +308,7 @@ public class GenotypeAndValidateWalker extends RodWalker header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName()); - Set samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); Set headerLines = VCFUtils.smartMergeHeaders(header.values(), logger); headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate")); vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 6a55b024b..629c7f84c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -174,17 +174,24 @@ public class CombineVariants extends RodWalker { /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ private boolean sitesOnlyVCF = false; + private Set samples; public void initialize() { Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit()); + if ( vcfWriter instanceof VariantContextWriterStub) { + sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES); + if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance"); + } else + logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); + if ( PRIORITY_STRING == null ) { PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); logger.info("Priority string not provided, using arbitrary genotyping order: " + PRIORITY_STRING); } validateAnnotateUnionArguments(); - Set samples = SampleUtils.getSampleList(vcfRods, genotypeMergeOption); + samples = sitesOnlyVCF ? Collections.emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption); if ( SET_KEY.toLowerCase().equals("null") ) SET_KEY = null; @@ -194,15 +201,9 @@ public class CombineVariants extends RodWalker { headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants")); if ( !ASSUME_IDENTICAL_SAMPLES ) headerLines.addAll(Arrays.asList(ChromosomeCounts.descriptions)); - VCFHeader vcfHeader = new VCFHeader(headerLines, sitesOnlyVCF ? Collections.emptySet() : samples); + VCFHeader vcfHeader = new VCFHeader(headerLines, samples); vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER); vcfWriter.writeHeader(vcfHeader); - - if ( vcfWriter instanceof VariantContextWriterStub) { - sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES); - if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance"); - } else - logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); } private void validateAnnotateUnionArguments() { @@ -296,7 +297,7 @@ public class CombineVariants extends RodWalker { VariantContextUtils.calculateChromosomeCounts(builder, false); if ( minimalVCF ) VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); - vcfWriter.add(builder.make()); + vcfWriter.add(VariantContextUtils.addMissingSamples(builder.make(), samples)); } return vcs.isEmpty() ? 0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index b508a9dd5..e8c6794f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -108,6 +108,7 @@ public class VariantsToVCF extends RodWalker { private Set allowedGenotypeFormatStrings = new HashSet(); private boolean wroteHeader = false; + private Set samples; // for dealing with indels in hapmap CloseableIterator dbsnpIterator = null; @@ -228,7 +229,7 @@ public class VariantsToVCF extends RodWalker { } } - Set samples = new LinkedHashSet(); + samples = new LinkedHashSet(); if ( sampleName != null ) { samples.add(sampleName); } else { @@ -252,6 +253,7 @@ public class VariantsToVCF extends RodWalker { } vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); + vc = VariantContextUtils.addMissingSamples(vc, samples); vcfwriter.add(vc); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index b697b3381..ccc0f5971 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -46,6 +46,7 @@ public class VariantContextUtils { public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_PREFIX = "filterIn"; + private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); final public static JexlEngine engine = new JexlEngine(); public static final int DEFAULT_PLOIDY = 2; @@ -57,6 +58,31 @@ public class VariantContextUtils { engine.setDebug(false); } + /** + * Ensures that VC contains all of the samples in allSamples by adding missing samples to + * the resulting VC with default diploid ./. genotypes + * + * @param vc + * @param allSamples + * @return + */ + public static VariantContext addMissingSamples(final VariantContext vc, final Set allSamples) { + // TODO -- what's the fastest way to do this calculation? + final Set missingSamples = new HashSet(allSamples); + missingSamples.removeAll(vc.getSampleNames()); + + if ( missingSamples.isEmpty() ) + return vc; + else { + //logger.warn("Adding " + missingSamples.size() + " missing samples to called context"); + final GenotypesContext gc = GenotypesContext.copy(vc.getGenotypes()); + for ( final String missing : missingSamples ) { + gc.add(new GenotypeBuilder(missing).alleles(DIPLOID_NO_CALL).make()); + } + return new VariantContextBuilder(vc).genotypes(gc).make(); + } + } + /** * Update the attributes of the attributes map given the VariantContext to reflect the * proper chromosome-based VCF tags diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index 1e15c2bc5..5555849dd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -302,9 +302,7 @@ class BCF2Writer extends IndexingVariantContextWriter { writer.start(encoder, vc); for ( final String name : sampleNames ) { Genotype g = vc.getGenotype(name); - if ( g == null ) - // we don't have any data about g at all - g = new GenotypeBuilder(name).alleles(MISSING_GENOTYPE).make(); + if ( g == null ) VCFWriter.missingSampleError(vc, header); writer.addGenotype(encoder, vc, g); } writer.done(encoder, vc); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index 651223ac3..ee7b1b9ef 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.variantcontext.writer; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -339,23 +340,12 @@ class VCFWriter extends IndexingVariantContextWriter { */ private void addGenotypeData(VariantContext vc, Map alleleMap, List genotypeFormatKeys) throws IOException { -// if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) { -// final List badSampleNames = new ArrayList(); -// for ( final Genotype g : vc.getGenotypes() ) -// if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) ) -// badSampleNames.add(g.getSampleName()); -// throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames)); -// } - for ( String sample : mHeader.getGenotypeSamples() ) { mWriter.write(VCFConstants.FIELD_SEPARATOR); Genotype g = vc.getGenotype(sample); if ( g == null ) { - // TODO -- The VariantContext needs to know what the general ploidy is of the samples - // TODO -- We shouldn't be assuming diploid genotypes here! - mWriter.write(VCFConstants.EMPTY_GENOTYPE); - continue; + missingSampleError(vc, mHeader); } List attrs = new ArrayList(genotypeFormatKeys.size()); @@ -439,6 +429,13 @@ class VCFWriter extends IndexingVariantContextWriter { } } + public static final void missingSampleError(final VariantContext vc, final VCFHeader header) { + final List badSampleNames = new ArrayList(); + for ( final String x : header.getGenotypeSamples() ) + if ( ! vc.hasGenotype(x) ) badSampleNames.add(x); + throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects. Missing samples are " + Utils.join(",", badSampleNames)); + } + private boolean isMissingValue(String s) { // we need to deal with the case that it's a list of missing values return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length());