VCFWriter and BCFWriter no longer allow missing samples in the VC compared to their header
-- They now throw an error, as its really unsafe to write out ./. as a special case in the VCFWriter as occurred previously. -- Added convenience method in VariantContextUtils.addMissingSamples(vc, allSamples) that returns a complete VC where samples are given ./. Genotype objects -- This allows us to properly pass tests of creating / writing / reading VCFs and BCFs, which previously differed because the VC from the VCF would actually be different from its original VC -- Updated UG, UGEngine, GenotypeAndValidateWalker, CombineVariants, and VariantsToVCF to manage the master list of samples they are writing out and addMissingSamples via the VCU function
This commit is contained in:
parent
4811a00891
commit
7144154f53
|
|
@ -187,6 +187,8 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
// the annotation engine
|
||||
private VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
private Set<String> samples;
|
||||
|
||||
// enable deletions in the pileup
|
||||
@Override
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
|
@ -231,7 +233,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode");
|
||||
|
||||
// get all of the unique sample names
|
||||
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
// initialize the verbose writer
|
||||
if ( verboseWriter != null )
|
||||
|
|
@ -298,7 +300,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
* @return the VariantCallContext object
|
||||
*/
|
||||
public List<VariantCallContext> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext);
|
||||
return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, samples);
|
||||
}
|
||||
|
||||
public UGStatistics reduceInit() { return new UGStatistics(); }
|
||||
|
|
|
|||
|
|
@ -140,14 +140,39 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
/**
|
||||
* Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper.
|
||||
* @see #calculateLikelihoodsAndGenotypes(org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker, org.broadinstitute.sting.gatk.contexts.ReferenceContext, org.broadinstitute.sting.gatk.contexts.AlignmentContext, java.util.Set)
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @return the VariantCallContext object
|
||||
* same as the full call but with allSamples == null
|
||||
*
|
||||
* @param tracker
|
||||
* @param refContext
|
||||
* @param rawContext
|
||||
* @return
|
||||
*/
|
||||
public List<VariantCallContext> calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext refContext,
|
||||
final AlignmentContext rawContext) {
|
||||
return calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, null);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper.
|
||||
*
|
||||
* If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype
|
||||
* for every sample in allSamples. If it's null there's no such guarentee. Providing this
|
||||
* argument is critical when the resulting calls will be written to a VCF file.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @param allSamples set of all sample names that we might call (i.e., those in the VCF header)
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext refContext,
|
||||
final AlignmentContext rawContext,
|
||||
final Set<String> allSamples) {
|
||||
final List<VariantCallContext> results = new ArrayList<VariantCallContext>(2);
|
||||
|
||||
final List<GenotypeLikelihoodsCalculationModel.Model> models = getGLModelsToUse(tracker, refContext, rawContext);
|
||||
|
|
@ -168,7 +193,23 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
return addMissingSamples(results, allSamples);
|
||||
}
|
||||
|
||||
private List<VariantCallContext> addMissingSamples(final List<VariantCallContext> calls, final Set<String> allSamples) {
|
||||
if ( calls.isEmpty() || allSamples == null ) return calls;
|
||||
|
||||
final List<VariantCallContext> withAllSamples = new ArrayList<VariantCallContext>(calls.size());
|
||||
for ( final VariantCallContext call : calls ) {
|
||||
if ( call == null )
|
||||
withAllSamples.add(call);
|
||||
else {
|
||||
final VariantContext withoutMissing = VariantContextUtils.addMissingSamples(call, allSamples);
|
||||
withAllSamples.add(new VariantCallContext(withoutMissing, call.confidentlyCalled, call.shouldEmit));
|
||||
}
|
||||
}
|
||||
|
||||
return withAllSamples;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -261,6 +261,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
|
|||
|
||||
private UnifiedGenotyperEngine snpEngine;
|
||||
private UnifiedGenotyperEngine indelEngine;
|
||||
private Set<String> samples;
|
||||
|
||||
public static class CountedData {
|
||||
private long nAltCalledAlt = 0L;
|
||||
|
|
@ -307,7 +308,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
|
|||
// Initialize VCF header
|
||||
if (vcfWriter != null) {
|
||||
Map<String, VCFHeader> header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName());
|
||||
Set<String> samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
|
||||
samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
|
||||
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(header.values(), logger);
|
||||
headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate"));
|
||||
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
|
||||
|
|
|
|||
|
|
@ -174,17 +174,24 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
/** Optimization to strip out genotypes before merging if we are doing a sites_only output */
|
||||
private boolean sitesOnlyVCF = false;
|
||||
private Set<String> samples;
|
||||
|
||||
public void initialize() {
|
||||
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit());
|
||||
|
||||
if ( vcfWriter instanceof VariantContextWriterStub) {
|
||||
sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES);
|
||||
if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance");
|
||||
} else
|
||||
logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option");
|
||||
|
||||
if ( PRIORITY_STRING == null ) {
|
||||
PRIORITY_STRING = Utils.join(",", vcfRods.keySet());
|
||||
logger.info("Priority string not provided, using arbitrary genotyping order: " + PRIORITY_STRING);
|
||||
}
|
||||
|
||||
validateAnnotateUnionArguments();
|
||||
Set<String> samples = SampleUtils.getSampleList(vcfRods, genotypeMergeOption);
|
||||
samples = sitesOnlyVCF ? Collections.<String>emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption);
|
||||
|
||||
if ( SET_KEY.toLowerCase().equals("null") )
|
||||
SET_KEY = null;
|
||||
|
|
@ -194,15 +201,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants"));
|
||||
if ( !ASSUME_IDENTICAL_SAMPLES )
|
||||
headerLines.addAll(Arrays.asList(ChromosomeCounts.descriptions));
|
||||
VCFHeader vcfHeader = new VCFHeader(headerLines, sitesOnlyVCF ? Collections.<String>emptySet() : samples);
|
||||
VCFHeader vcfHeader = new VCFHeader(headerLines, samples);
|
||||
vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
|
||||
if ( vcfWriter instanceof VariantContextWriterStub) {
|
||||
sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES);
|
||||
if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance");
|
||||
} else
|
||||
logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option");
|
||||
}
|
||||
|
||||
private void validateAnnotateUnionArguments() {
|
||||
|
|
@ -296,7 +297,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
VariantContextUtils.calculateChromosomeCounts(builder, false);
|
||||
if ( minimalVCF )
|
||||
VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY));
|
||||
vcfWriter.add(builder.make());
|
||||
vcfWriter.add(VariantContextUtils.addMissingSamples(builder.make(), samples));
|
||||
}
|
||||
|
||||
return vcs.isEmpty() ? 0 : 1;
|
||||
|
|
|
|||
|
|
@ -108,6 +108,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
|||
|
||||
private Set<String> allowedGenotypeFormatStrings = new HashSet<String>();
|
||||
private boolean wroteHeader = false;
|
||||
private Set<String> samples;
|
||||
|
||||
// for dealing with indels in hapmap
|
||||
CloseableIterator<GATKFeature> dbsnpIterator = null;
|
||||
|
|
@ -228,7 +229,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
Set<String> samples = new LinkedHashSet<String>();
|
||||
samples = new LinkedHashSet<String>();
|
||||
if ( sampleName != null ) {
|
||||
samples.add(sampleName);
|
||||
} else {
|
||||
|
|
@ -252,6 +253,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings);
|
||||
vc = VariantContextUtils.addMissingSamples(vc, samples);
|
||||
vcfwriter.add(vc);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@ public class VariantContextUtils {
|
|||
public final static String MERGE_FILTER_IN_ALL = "FilteredInAll";
|
||||
public final static String MERGE_REF_IN_ALL = "ReferenceInAll";
|
||||
public final static String MERGE_FILTER_PREFIX = "filterIn";
|
||||
private static final List<Allele> DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
||||
|
||||
final public static JexlEngine engine = new JexlEngine();
|
||||
public static final int DEFAULT_PLOIDY = 2;
|
||||
|
|
@ -57,6 +58,31 @@ public class VariantContextUtils {
|
|||
engine.setDebug(false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensures that VC contains all of the samples in allSamples by adding missing samples to
|
||||
* the resulting VC with default diploid ./. genotypes
|
||||
*
|
||||
* @param vc
|
||||
* @param allSamples
|
||||
* @return
|
||||
*/
|
||||
public static VariantContext addMissingSamples(final VariantContext vc, final Set<String> allSamples) {
|
||||
// TODO -- what's the fastest way to do this calculation?
|
||||
final Set<String> missingSamples = new HashSet<String>(allSamples);
|
||||
missingSamples.removeAll(vc.getSampleNames());
|
||||
|
||||
if ( missingSamples.isEmpty() )
|
||||
return vc;
|
||||
else {
|
||||
//logger.warn("Adding " + missingSamples.size() + " missing samples to called context");
|
||||
final GenotypesContext gc = GenotypesContext.copy(vc.getGenotypes());
|
||||
for ( final String missing : missingSamples ) {
|
||||
gc.add(new GenotypeBuilder(missing).alleles(DIPLOID_NO_CALL).make());
|
||||
}
|
||||
return new VariantContextBuilder(vc).genotypes(gc).make();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the attributes of the attributes map given the VariantContext to reflect the
|
||||
* proper chromosome-based VCF tags
|
||||
|
|
|
|||
|
|
@ -302,9 +302,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
writer.start(encoder, vc);
|
||||
for ( final String name : sampleNames ) {
|
||||
Genotype g = vc.getGenotype(name);
|
||||
if ( g == null )
|
||||
// we don't have any data about g at all
|
||||
g = new GenotypeBuilder(name).alleles(MISSING_GENOTYPE).make();
|
||||
if ( g == null ) VCFWriter.missingSampleError(vc, header);
|
||||
writer.addGenotype(encoder, vc, g);
|
||||
}
|
||||
writer.done(encoder, vc);
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.variantcontext.writer;
|
|||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -339,23 +340,12 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
*/
|
||||
private void addGenotypeData(VariantContext vc, Map<Allele, String> alleleMap, List<String> genotypeFormatKeys)
|
||||
throws IOException {
|
||||
// if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) {
|
||||
// final List<String> badSampleNames = new ArrayList<String>();
|
||||
// for ( final Genotype g : vc.getGenotypes() )
|
||||
// if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) )
|
||||
// badSampleNames.add(g.getSampleName());
|
||||
// throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames));
|
||||
// }
|
||||
|
||||
for ( String sample : mHeader.getGenotypeSamples() ) {
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
Genotype g = vc.getGenotype(sample);
|
||||
if ( g == null ) {
|
||||
// TODO -- The VariantContext needs to know what the general ploidy is of the samples
|
||||
// TODO -- We shouldn't be assuming diploid genotypes here!
|
||||
mWriter.write(VCFConstants.EMPTY_GENOTYPE);
|
||||
continue;
|
||||
missingSampleError(vc, mHeader);
|
||||
}
|
||||
|
||||
List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
|
||||
|
|
@ -439,6 +429,13 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
}
|
||||
}
|
||||
|
||||
public static final void missingSampleError(final VariantContext vc, final VCFHeader header) {
|
||||
final List<String> badSampleNames = new ArrayList<String>();
|
||||
for ( final String x : header.getGenotypeSamples() )
|
||||
if ( ! vc.hasGenotype(x) ) badSampleNames.add(x);
|
||||
throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects. Missing samples are " + Utils.join(",", badSampleNames));
|
||||
}
|
||||
|
||||
private boolean isMissingValue(String s) {
|
||||
// we need to deal with the case that it's a list of missing values
|
||||
return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length());
|
||||
|
|
|
|||
Loading…
Reference in New Issue