VCF/BCF writers once again automatically write out no-call genotypes for samples in the VCFHeader but not in the VC itself
-- Turns out this was consuming 30% of the UG runtime, and causing problems elsewhere. -- Removed addMissingSamples from VariantcontextUtils, and calls to it -- Updated VCF / BCF writers to automatically write out a diploid no call for missing samples -- Added unit tests for this behavior in VariantContextWritersUnitTest
This commit is contained in:
parent
d3bdb9c67e
commit
91f3204534
|
|
@ -38,7 +38,6 @@ import org.broadinstitute.sting.utils.*;
|
|||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -198,23 +197,7 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
}
|
||||
|
||||
return addMissingSamples(results, allSamples);
|
||||
}
|
||||
|
||||
private List<VariantCallContext> addMissingSamples(final List<VariantCallContext> calls, final Set<String> allSamples) {
|
||||
if ( calls.isEmpty() || allSamples == null ) return calls;
|
||||
|
||||
final List<VariantCallContext> withAllSamples = new ArrayList<VariantCallContext>(calls.size());
|
||||
for ( final VariantCallContext call : calls ) {
|
||||
if ( call == null )
|
||||
withAllSamples.add(null);
|
||||
else {
|
||||
final VariantContext withoutMissing = VariantContextUtils.addMissingSamples(call, allSamples);
|
||||
withAllSamples.add(new VariantCallContext(withoutMissing, call.confidentlyCalled, call.shouldEmit));
|
||||
}
|
||||
}
|
||||
|
||||
return withAllSamples;
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -313,7 +313,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
|
|||
VariantContextUtils.calculateChromosomeCounts(builder, false);
|
||||
if ( minimalVCF )
|
||||
VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY));
|
||||
vcfWriter.add(VariantContextUtils.addMissingSamples(builder.make(), samples));
|
||||
vcfWriter.add(builder.make());
|
||||
}
|
||||
|
||||
return vcs.isEmpty() ? 0 : 1;
|
||||
|
|
|
|||
|
|
@ -246,7 +246,6 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings);
|
||||
vc = VariantContextUtils.addMissingSamples(vc, samples);
|
||||
vcfwriter.add(vc);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -53,6 +53,8 @@ import java.util.*;
|
|||
*/
|
||||
@Invariant({"alleles != null"})
|
||||
public final class GenotypeBuilder {
|
||||
private static final List<Allele> DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
||||
|
||||
private String sampleName = null;
|
||||
private List<Allele> alleles = Collections.emptyList();
|
||||
|
||||
|
|
@ -90,6 +92,17 @@ public final class GenotypeBuilder {
|
|||
return new GenotypeBuilder(sampleName, alleles).PL(gls).make();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Genotype object for a sample that's missing from the VC (i.e., in
|
||||
* the output header). Defaults to a diploid no call genotype ./.
|
||||
*
|
||||
* @param sampleName the name of this sample
|
||||
* @return an initialized Genotype with sampleName that's a diploid ./. no call genotype
|
||||
*/
|
||||
public static Genotype createMissing(final String sampleName) {
|
||||
return new GenotypeBuilder(sampleName).alleles(DIPLOID_NO_CALL).make();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a empty builder. Both a sampleName and alleles must be provided
|
||||
* before trying to make a Genotype from this builder.
|
||||
|
|
|
|||
|
|
@ -32,8 +32,8 @@ import org.apache.log4j.Logger;
|
|||
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
|
|
@ -47,7 +47,6 @@ public class VariantContextUtils {
|
|||
public final static String MERGE_REF_IN_ALL = "ReferenceInAll";
|
||||
public final static String MERGE_FILTER_PREFIX = "filterIn";
|
||||
|
||||
private static final List<Allele> DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
||||
private static Set<String> MISSING_KEYS_WARNED_ABOUT = new HashSet<String>();
|
||||
|
||||
final public static JexlEngine engine = new JexlEngine();
|
||||
|
|
@ -60,31 +59,6 @@ public class VariantContextUtils {
|
|||
engine.setDebug(false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensures that VC contains all of the samples in allSamples by adding missing samples to
|
||||
* the resulting VC with default diploid ./. genotypes
|
||||
*
|
||||
* @param vc the VariantContext
|
||||
* @param allSamples all of the samples needed
|
||||
* @return a new VariantContext with missing samples added
|
||||
*/
|
||||
public static VariantContext addMissingSamples(final VariantContext vc, final Set<String> allSamples) {
|
||||
// TODO -- what's the fastest way to do this calculation?
|
||||
final Set<String> missingSamples = new HashSet<String>(allSamples);
|
||||
missingSamples.removeAll(vc.getSampleNames());
|
||||
|
||||
if ( missingSamples.isEmpty() )
|
||||
return vc;
|
||||
else {
|
||||
//logger.warn("Adding " + missingSamples.size() + " missing samples to called context");
|
||||
final GenotypesContext gc = GenotypesContext.copy(vc.getGenotypes());
|
||||
for ( final String missing : missingSamples ) {
|
||||
gc.add(new GenotypeBuilder(missing).alleles(DIPLOID_NO_CALL).make());
|
||||
}
|
||||
return new VariantContextBuilder(vc).genotypes(gc).make();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the attributes of the attributes map given the VariantContext to reflect the
|
||||
* proper chromosome-based VCF tags
|
||||
|
|
|
|||
|
|
@ -32,7 +32,10 @@ import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec;
|
|||
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type;
|
||||
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.bcf2.BCFVersion;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFContigHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
|
@ -345,10 +348,12 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field);
|
||||
if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT");
|
||||
|
||||
assert writer != null;
|
||||
|
||||
writer.start(encoder, vc);
|
||||
for ( final String name : sampleNames ) {
|
||||
Genotype g = vc.getGenotype(name);
|
||||
if ( g == null ) VCFWriter.missingSampleError(vc, header);
|
||||
if ( g == null ) g = GenotypeBuilder.createMissing(name);
|
||||
writer.addGenotype(encoder, vc, g);
|
||||
}
|
||||
writer.done(encoder, vc);
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.variantcontext.writer;
|
|||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -343,9 +342,7 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
Genotype g = vc.getGenotype(sample);
|
||||
if ( g == null ) {
|
||||
missingSampleError(vc, mHeader);
|
||||
}
|
||||
if ( g == null ) g = GenotypeBuilder.createMissing(sample);
|
||||
|
||||
final List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
|
||||
for ( String field : genotypeFormatKeys ) {
|
||||
|
|
@ -426,13 +423,6 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
}
|
||||
}
|
||||
|
||||
public static final void missingSampleError(final VariantContext vc, final VCFHeader header) {
|
||||
final List<String> badSampleNames = new ArrayList<String>();
|
||||
for ( final String x : header.getGenotypeSamples() )
|
||||
if ( ! vc.hasGenotype(x) ) badSampleNames.add(x);
|
||||
throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects. Missing samples are " + Utils.join(",", badSampleNames));
|
||||
}
|
||||
|
||||
private boolean isMissingValue(String s) {
|
||||
// we need to deal with the case that it's a list of missing values
|
||||
return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length());
|
||||
|
|
|
|||
|
|
@ -596,6 +596,51 @@ public class VariantContextTestProvider {
|
|||
return TEST_DATAs;
|
||||
}
|
||||
|
||||
public static void testReaderWriterWithMissingGenotypes(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException {
|
||||
final int nSamples = data.header.getNGenotypeSamples();
|
||||
if ( nSamples > 2 ) {
|
||||
for ( final VariantContext vc : data.vcs )
|
||||
if ( vc.isSymbolic() )
|
||||
// cannot handle symbolic alleles because they may be weird non-call VCFs
|
||||
return;
|
||||
|
||||
final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension());
|
||||
tmpFile.deleteOnExit();
|
||||
|
||||
// write expected to disk
|
||||
final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
|
||||
final VariantContextWriter writer = tester.makeWriter(tmpFile, options);
|
||||
|
||||
final Set<String> samplesInVCF = new HashSet<String>(data.header.getGenotypeSamples());
|
||||
final List<String> missingSamples = Arrays.asList("MISSING1", "MISSING2");
|
||||
final List<String> allSamples = new ArrayList<String>(missingSamples);
|
||||
allSamples.addAll(samplesInVCF);
|
||||
|
||||
final VCFHeader header = new VCFHeader(data.header.getMetaDataInInputOrder(), allSamples);
|
||||
writeVCsToFile(writer, header, data.vcs);
|
||||
|
||||
// ensure writing of expected == actual
|
||||
final Pair<VCFHeader, Iterable<VariantContext>> p = readAllVCs(tmpFile, tester.makeCodec());
|
||||
final Iterable<VariantContext> actual = p.getSecond();
|
||||
|
||||
int i = 0;
|
||||
for ( final VariantContext readVC : actual ) {
|
||||
if ( readVC == null ) continue; // sometimes we read null records...
|
||||
final VariantContext expected = data.vcs.get(i++);
|
||||
for ( final Genotype g : readVC.getGenotypes() ) {
|
||||
Assert.assertTrue(allSamples.contains(g.getSampleName()));
|
||||
if ( samplesInVCF.contains(g.getSampleName()) ) {
|
||||
assertEquals(g, expected.getGenotype(g.getSampleName()));
|
||||
} else {
|
||||
// missing
|
||||
Assert.assertTrue(g.isNoCall());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException {
|
||||
testReaderWriter(tester, data.header, data.vcs, data.vcs, true);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -82,6 +82,11 @@ public class VariantContextWritersUnitTest extends BaseTest {
|
|||
VariantContextTestProvider.testReaderWriter(new BCFIOTester(), testData);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "VariantContextTest_SingleContexts")
|
||||
public void testBCF2WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException {
|
||||
VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new BCFIOTester(), testData);
|
||||
}
|
||||
|
||||
private class BCFIOTester extends VariantContextTestProvider.VariantContextIOTest {
|
||||
@Override
|
||||
public String getExtension() {
|
||||
|
|
@ -110,6 +115,11 @@ public class VariantContextWritersUnitTest extends BaseTest {
|
|||
VariantContextTestProvider.testReaderWriter(new VCFIOTester(), testData);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "VariantContextTest_SingleContexts")
|
||||
public void testVCF4WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException {
|
||||
VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new VCFIOTester(), testData);
|
||||
}
|
||||
|
||||
private class VCFIOTester extends VariantContextTestProvider.VariantContextIOTest {
|
||||
@Override
|
||||
public String getExtension() {
|
||||
|
|
|
|||
Loading…
Reference in New Issue