VCFWriter and BCFWriter no longer allow missing samples in the VC compared to their header

-- They now throw an error, as its really unsafe to write out ./. as a special case in the VCFWriter as occurred previously.
-- Added convenience method in VariantContextUtils.addMissingSamples(vc, allSamples) that returns a complete VC where samples are given ./. Genotype objects
-- This allows us to properly pass tests of creating / writing / reading VCFs and BCFs, which previously differed because the VC from the VCF would actually be different from its original VC
-- Updated UG, UGEngine, GenotypeAndValidateWalker, CombineVariants, and VariantsToVCF to manage the master list of samples they are writing out and addMissingSamples via the VCU function
This commit is contained in:
Mark DePristo 2012-06-27 17:39:00 -04:00
parent 4811a00891
commit 7144154f53
8 changed files with 103 additions and 35 deletions

View File

@ -187,6 +187,8 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
// the annotation engine
private VariantAnnotatorEngine annotationEngine;
private Set<String> samples;
// enable deletions in the pileup
@Override
public boolean includeReadsWithDeletionAtLoci() { return true; }
@ -231,7 +233,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode");
// get all of the unique sample names
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
// initialize the verbose writer
if ( verboseWriter != null )
@ -298,7 +300,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
* @return the VariantCallContext object
*/
public List<VariantCallContext> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext);
return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, samples);
}
public UGStatistics reduceInit() { return new UGStatistics(); }

View File

@ -140,14 +140,39 @@ public class UnifiedGenotyperEngine {
}
/**
* Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper.
* @see #calculateLikelihoodsAndGenotypes(org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker, org.broadinstitute.sting.gatk.contexts.ReferenceContext, org.broadinstitute.sting.gatk.contexts.AlignmentContext, java.util.Set)
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @return the VariantCallContext object
* same as the full call but with allSamples == null
*
* @param tracker
* @param refContext
* @param rawContext
* @return
*/
public List<VariantCallContext> calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
final ReferenceContext refContext,
final AlignmentContext rawContext) {
return calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, null);
}
/**
* Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper.
*
* If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype
* for every sample in allSamples. If it's null there's no such guarentee. Providing this
* argument is critical when the resulting calls will be written to a VCF file.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @param allSamples set of all sample names that we might call (i.e., those in the VCF header)
* @return the VariantCallContext object
*/
public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
final ReferenceContext refContext,
final AlignmentContext rawContext,
final Set<String> allSamples) {
final List<VariantCallContext> results = new ArrayList<VariantCallContext>(2);
final List<GenotypeLikelihoodsCalculationModel.Model> models = getGLModelsToUse(tracker, refContext, rawContext);
@ -168,7 +193,23 @@ public class UnifiedGenotyperEngine {
}
}
return results;
return addMissingSamples(results, allSamples);
}
private List<VariantCallContext> addMissingSamples(final List<VariantCallContext> calls, final Set<String> allSamples) {
if ( calls.isEmpty() || allSamples == null ) return calls;
final List<VariantCallContext> withAllSamples = new ArrayList<VariantCallContext>(calls.size());
for ( final VariantCallContext call : calls ) {
if ( call == null )
withAllSamples.add(call);
else {
final VariantContext withoutMissing = VariantContextUtils.addMissingSamples(call, allSamples);
withAllSamples.add(new VariantCallContext(withoutMissing, call.confidentlyCalled, call.shouldEmit));
}
}
return withAllSamples;
}
/**

View File

@ -261,6 +261,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
private UnifiedGenotyperEngine snpEngine;
private UnifiedGenotyperEngine indelEngine;
private Set<String> samples;
public static class CountedData {
private long nAltCalledAlt = 0L;
@ -307,7 +308,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
// Initialize VCF header
if (vcfWriter != null) {
Map<String, VCFHeader> header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName());
Set<String> samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(header.values(), logger);
headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate"));
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));

View File

@ -174,17 +174,24 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
/** Optimization to strip out genotypes before merging if we are doing a sites_only output */
private boolean sitesOnlyVCF = false;
private Set<String> samples;
public void initialize() {
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit());
if ( vcfWriter instanceof VariantContextWriterStub) {
sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES);
if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance");
} else
logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option");
if ( PRIORITY_STRING == null ) {
PRIORITY_STRING = Utils.join(",", vcfRods.keySet());
logger.info("Priority string not provided, using arbitrary genotyping order: " + PRIORITY_STRING);
}
validateAnnotateUnionArguments();
Set<String> samples = SampleUtils.getSampleList(vcfRods, genotypeMergeOption);
samples = sitesOnlyVCF ? Collections.<String>emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption);
if ( SET_KEY.toLowerCase().equals("null") )
SET_KEY = null;
@ -194,15 +201,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants"));
if ( !ASSUME_IDENTICAL_SAMPLES )
headerLines.addAll(Arrays.asList(ChromosomeCounts.descriptions));
VCFHeader vcfHeader = new VCFHeader(headerLines, sitesOnlyVCF ? Collections.<String>emptySet() : samples);
VCFHeader vcfHeader = new VCFHeader(headerLines, samples);
vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER);
vcfWriter.writeHeader(vcfHeader);
if ( vcfWriter instanceof VariantContextWriterStub) {
sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES);
if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance");
} else
logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option");
}
private void validateAnnotateUnionArguments() {
@ -296,7 +297,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
VariantContextUtils.calculateChromosomeCounts(builder, false);
if ( minimalVCF )
VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY));
vcfWriter.add(builder.make());
vcfWriter.add(VariantContextUtils.addMissingSamples(builder.make(), samples));
}
return vcs.isEmpty() ? 0 : 1;

View File

@ -108,6 +108,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
private Set<String> allowedGenotypeFormatStrings = new HashSet<String>();
private boolean wroteHeader = false;
private Set<String> samples;
// for dealing with indels in hapmap
CloseableIterator<GATKFeature> dbsnpIterator = null;
@ -228,7 +229,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
}
}
Set<String> samples = new LinkedHashSet<String>();
samples = new LinkedHashSet<String>();
if ( sampleName != null ) {
samples.add(sampleName);
} else {
@ -252,6 +253,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
}
vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings);
vc = VariantContextUtils.addMissingSamples(vc, samples);
vcfwriter.add(vc);
}

View File

@ -46,6 +46,7 @@ public class VariantContextUtils {
public final static String MERGE_FILTER_IN_ALL = "FilteredInAll";
public final static String MERGE_REF_IN_ALL = "ReferenceInAll";
public final static String MERGE_FILTER_PREFIX = "filterIn";
private static final List<Allele> DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
final public static JexlEngine engine = new JexlEngine();
public static final int DEFAULT_PLOIDY = 2;
@ -57,6 +58,31 @@ public class VariantContextUtils {
engine.setDebug(false);
}
/**
* Ensures that VC contains all of the samples in allSamples by adding missing samples to
* the resulting VC with default diploid ./. genotypes
*
* @param vc
* @param allSamples
* @return
*/
public static VariantContext addMissingSamples(final VariantContext vc, final Set<String> allSamples) {
// TODO -- what's the fastest way to do this calculation?
final Set<String> missingSamples = new HashSet<String>(allSamples);
missingSamples.removeAll(vc.getSampleNames());
if ( missingSamples.isEmpty() )
return vc;
else {
//logger.warn("Adding " + missingSamples.size() + " missing samples to called context");
final GenotypesContext gc = GenotypesContext.copy(vc.getGenotypes());
for ( final String missing : missingSamples ) {
gc.add(new GenotypeBuilder(missing).alleles(DIPLOID_NO_CALL).make());
}
return new VariantContextBuilder(vc).genotypes(gc).make();
}
}
/**
* Update the attributes of the attributes map given the VariantContext to reflect the
* proper chromosome-based VCF tags

View File

@ -302,9 +302,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
writer.start(encoder, vc);
for ( final String name : sampleNames ) {
Genotype g = vc.getGenotype(name);
if ( g == null )
// we don't have any data about g at all
g = new GenotypeBuilder(name).alleles(MISSING_GENOTYPE).make();
if ( g == null ) VCFWriter.missingSampleError(vc, header);
writer.addGenotype(encoder, vc, g);
}
writer.done(encoder, vc);

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.variantcontext.writer;
import net.sf.samtools.SAMSequenceDictionary;
import org.broad.tribble.TribbleException;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -339,23 +340,12 @@ class VCFWriter extends IndexingVariantContextWriter {
*/
private void addGenotypeData(VariantContext vc, Map<Allele, String> alleleMap, List<String> genotypeFormatKeys)
throws IOException {
// if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) {
// final List<String> badSampleNames = new ArrayList<String>();
// for ( final Genotype g : vc.getGenotypes() )
// if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) )
// badSampleNames.add(g.getSampleName());
// throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames));
// }
for ( String sample : mHeader.getGenotypeSamples() ) {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
Genotype g = vc.getGenotype(sample);
if ( g == null ) {
// TODO -- The VariantContext needs to know what the general ploidy is of the samples
// TODO -- We shouldn't be assuming diploid genotypes here!
mWriter.write(VCFConstants.EMPTY_GENOTYPE);
continue;
missingSampleError(vc, mHeader);
}
List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
@ -439,6 +429,13 @@ class VCFWriter extends IndexingVariantContextWriter {
}
}
public static final void missingSampleError(final VariantContext vc, final VCFHeader header) {
final List<String> badSampleNames = new ArrayList<String>();
for ( final String x : header.getGenotypeSamples() )
if ( ! vc.hasGenotype(x) ) badSampleNames.add(x);
throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects. Missing samples are " + Utils.join(",", badSampleNames));
}
private boolean isMissingValue(String s) {
// we need to deal with the case that it's a list of missing values
return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length());