Phase I commit to get shadowBCFs passing tests

-- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header.  This helps avoid some of the low-level errors I saw in SelectVariants.  This behavior can be disabled in the engine with the --allowMissingVCFHeaders argument
-- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA
-- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample
-- SelectVariants header fixes -- Was defining DP (an INFO field) as a FORMAT field in the header, and likewise for the original chromosome-count fields AC_Orig, AF_Orig, and AN_Orig
-- Performance optimizations in BCF2 codec and writer
    -- using arrays not lists for intermediate data structures
    -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over
-- VCFHeader (which needs a complete rewrite, FYI Eric)
    -- Warn about, and fix on the fly, Flag-type header lines declared with counts > 0 (Flag fields must have a count of 0)
    -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow.  Duplicates are detected once at header creation.
    -- Explicitly track FILTER fields for efficient lookup in their own hashmap
    -- Automatically add PL field when we see a GL field and no PL field
    -- Added get and has methods for INFO, FILTER, and FORMAT fields
-- No longer add AC and AF values to the INFO field when there's no ALT allele
-- Memory efficient comparison of VCF and BCF files for shadow BCF testing.  Now there's no (memory) constraint on the size of the files we can compare
-- Because of VCF's limited floating point resolution we can only use 1 significant digit when comparing doubles between BCF and VCF
This commit is contained in:
Mark DePristo 2012-06-15 14:25:00 -04:00
parent ab53220635
commit 9c81f45c9f
25 changed files with 344 additions and 133 deletions

View File

@ -347,6 +347,9 @@ public class GATKArgumentCollection {
public boolean USE_SLOW_GENOTYPES = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
@Argument(fullName="allowMissingVCFHeaders",shortName = "allowMissingVCFHeaders",doc="If provided, the GATK will write out VCF files that contain INFO, FILTER, and FORMAT fields not found in the VCF header",required=false)
public boolean allowMissingVCFHeaders = false;
/**
* The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file
* and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other

View File

@ -74,7 +74,8 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
else if ( stub.getOutputStream() != null ) {
this.file = null;
this.stream = stub.getOutputStream();
writer = VariantContextWriterFactory.create(stream, stub.getMasterSequenceDictionary(), stub.getWriterOptions(false));
writer = VariantContextWriterFactory.create(stream,
stub.getMasterSequenceDictionary(), stub.getWriterOptions(false));
}
else
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");

View File

@ -183,6 +183,7 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
List<Options> options = new ArrayList<Options>();
if ( doNotWriteGenotypes ) options.add(Options.DO_NOT_WRITE_GENOTYPES);
if ( engine.getArguments().allowMissingVCFHeaders ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER);
if ( indexOnTheFly && ! isCompressed() ) options.add(Options.INDEX_ON_THE_FLY);
return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options);

View File

@ -68,9 +68,10 @@ public class TandemRepeatAnnotator extends InfoFieldAnnotation implements Standa
}
public static final String[] keyNames = {STR_PRESENT, REPEAT_UNIT_KEY,REPEATS_PER_ALLELE_KEY };
public static final VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(STR_PRESENT, 1, VCFHeaderLineType.Flag, "Variant is a short tandem repeat"),
public static final VCFInfoHeaderLine[] descriptions = {
new VCFInfoHeaderLine(STR_PRESENT, 0, VCFHeaderLineType.Flag, "Variant is a short tandem repeat"),
new VCFInfoHeaderLine(REPEAT_UNIT_KEY, 1, VCFHeaderLineType.String, "Tandem repeat unit (bases)"),
new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Number of times tandem repeat unit is repeated, for each allele (including reference)") };
new VCFInfoHeaderLine(REPEATS_PER_ALLELE_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Number of times tandem repeat unit is repeated, for each allele (including reference)") };
public List<String> getKeyNames() {
return Arrays.asList(keyNames);

View File

@ -203,8 +203,9 @@ public class VariantAnnotatorEngine {
// go through all the requested info annotationTypes
for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
Map<String, Object> annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(stratifiedContexts, vc);
if ( annotationsFromCurrentType != null )
if ( annotationsFromCurrentType != null ) {
infoAnnotations.putAll(annotationsFromCurrentType);
}
}
// generate a new annotated VC

View File

@ -198,8 +198,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// Variables
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
private Set<String> sampleNamesForEvaluation = new TreeSet<String>();
private Set<String> sampleNamesForStratification = new TreeSet<String>();
private boolean isSubsettingSamples;
private Set<String> sampleNamesForEvaluation = new LinkedHashSet<String>();
private Set<String> sampleNamesForStratification = new LinkedHashSet<String>();
// important stratifications
private boolean byFilterIsEnabled = false;
@ -249,8 +250,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals);
Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
// Load the sample list
sampleNamesForEvaluation.addAll(SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS));
// Load the sample list, using an intermediate tree set to sort the samples
final Set<String> allSampleNames = SampleUtils.getSamplesFromCommandLineInput(vcfSamples);
sampleNamesForEvaluation.addAll(new TreeSet<String>(SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS)));
isSubsettingSamples = ! sampleNamesForEvaluation.containsAll(allSampleNames);
if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) {
sampleNamesForStratification.addAll(sampleNamesForEvaluation);
@ -571,6 +574,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
public List<RodBinding<VariantContext>> getEvals() { return evals; }
public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; }
public Set<String> getSampleNamesForEvaluation() { return sampleNamesForEvaluation; }
public Set<String> getSampleNamesForStratification() { return sampleNamesForStratification; }

View File

@ -28,8 +28,6 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.StandardEval;
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
@ -37,13 +35,13 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.Require
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.StandardStratification;
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.lang.reflect.Field;
import java.util.*;
public class VariantEvalUtils {
@ -199,18 +197,32 @@ public class VariantEvalUtils {
* @return a new VariantContext with just the requested samples
*/
public VariantContext getSubsetOfVariantContext(VariantContext vc, Set<String> sampleNames) {
VariantContext vcsub = vc.subContextFromSamples(sampleNames, false);
VariantContextBuilder builder = new VariantContextBuilder(vcsub);
return ensureAnnotations(vc, vc.subContextFromSamples(sampleNames, false));
}
public VariantContext ensureAnnotations(final VariantContext vc, final VariantContext vcsub) {
final int originalAlleleCount = vc.getHetCount() + 2 * vc.getHomVarCount();
final int newAlleleCount = vcsub.getHetCount() + 2 * vcsub.getHomVarCount();
final boolean isSingleton = originalAlleleCount == newAlleleCount && newAlleleCount == 1;
final boolean hasChrCountAnnotations = vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) &&
vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) &&
vc.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY);
if (originalAlleleCount == newAlleleCount && newAlleleCount == 1) {
builder.attribute(VariantEvalWalker.IS_SINGLETON_KEY, true);
if ( ! isSingleton && hasChrCountAnnotations ) {
// nothing to update
return vc;
} else {
// have to do the work
VariantContextBuilder builder = new VariantContextBuilder(vc);
if ( isSingleton )
builder.attribute(VariantEvalWalker.IS_SINGLETON_KEY, true);
if ( ! hasChrCountAnnotations )
VariantContextUtils.calculateChromosomeCounts(builder, true);
return builder.make();
}
VariantContextUtils.calculateChromosomeCounts(builder, true);
return builder.make();
}
/**
@ -250,8 +262,11 @@ public class VariantEvalUtils {
// First, filter the VariantContext to represent only the samples for evaluation
VariantContext vcsub = vc;
if (subsetBySample && vc.hasGenotypes() && vc.hasGenotypes(variantEvalWalker.getSampleNamesForEvaluation())) {
vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation());
if (subsetBySample && vc.hasGenotypes()) {
if ( variantEvalWalker.isSubsettingToSpecificSamples() )
vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation());
else
vcsub = ensureAnnotations(vc, vc);
}
if ((byFilter || !vcsub.isFiltered())) {

View File

@ -427,12 +427,12 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
if (KEEP_ORIGINAL_CHR_COUNTS) {
headerLines.add(new VCFFormatHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC"));
headerLines.add(new VCFFormatHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF"));
headerLines.add(new VCFFormatHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN"));
headerLines.add(new VCFInfoHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC"));
headerLines.add(new VCFInfoHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF"));
headerLines.add(new VCFInfoHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN"));
}
headerLines.addAll(Arrays.asList(ChromosomeCounts.descriptions));
headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Depth of coverage"));
headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Depth of coverage"));
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) {

View File

@ -150,8 +150,7 @@ public class SampleUtils {
// iterate to get all of the sample names
for ( Map.Entry<String, VCFHeader> pair : VCFUtils.getVCFHeadersFromRods(toolkit).entrySet() ) {
Set<String> vcfSamples = pair.getValue().getGenotypeSamples();
for ( String sample : vcfSamples )
for ( String sample : pair.getValue().getGenotypeSamples() )
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey());
}
}

View File

@ -79,6 +79,14 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
*/
private BCF2GenotypeFieldDecoders gtFieldDecoders = null;
/**
* A cached array of GenotypeBuilders for efficient genotype decoding.
*
* Caching it allows us to avoid recreating this intermediate data
* structure each time we decode genotypes
*/
private GenotypeBuilder[] builders = null;
// for error handling
private int recordNo = 0;
private int pos = 0;
@ -168,6 +176,13 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
// prepare the genotype field decoders
gtFieldDecoders = new BCF2GenotypeFieldDecoders(header);
// create and initialize the genotype builder array
final int nSamples = header.getNGenotypeSamples();
builders = new GenotypeBuilder[nSamples];
for ( int i = 0; i < nSamples; i++ ) {
builders[i] = new GenotypeBuilder(header.getGenotypeSamples().get(i));
}
// position right before next line (would be right before first real record byte at end of header)
return new FeatureCodecHeader(header, inputStream.getPosition());
}
@ -256,6 +271,11 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
final int nFormatFields = nFormatSamples >> 24;
final int nSamples = nFormatSamples & 0x00FFFFF;
if ( header.getNGenotypeSamples() != nSamples )
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
"different numbers of samples per record. Saw " + header.getNGenotypeSamples() +
" samples in header but have a record with " + nSamples + " samples");
decodeID(builder);
final ArrayList<Allele> alleles = decodeAlleles(builder, pos, nAlleles);
decodeFilter(builder);
@ -416,11 +436,11 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
final VariantContextBuilder builder ) {
if (siteInfo.nSamples > 0) {
final LazyGenotypesContext.LazyParser lazyParser =
new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields);
final int nGenotypes = header.getGenotypeSamples().size();
new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders);
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser,
new LazyData(siteInfo.nFormatFields, decoder.getRecordBytes()),
nGenotypes);
header.getNGenotypeSamples());
// did we resort the sample names? If so, we need to load the genotype data
if ( !header.samplesWereAlreadySorted() )

View File

@ -99,21 +99,21 @@ public class BCF2GenotypeFieldDecoders {
*/
public interface Decoder {
@Requires({"siteAlleles != null", "! siteAlleles.isEmpty()",
"field != null", "decoder != null", "gbs != null", "! gbs.isEmpty()"})
"field != null", "decoder != null", "gbs != null", "gbs.length != 0"})
public void decode(final List<Allele> siteAlleles,
final String field,
final BCF2Decoder decoder,
final byte typeDescriptor,
final List<GenotypeBuilder> gbs);
final GenotypeBuilder[] gbs);
}
private class GTDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
// we have to do a bit of low-level processing here as we want to know the size upfronta
final int ploidy = decoder.decodeNumberOfElements(typeDescriptor);
if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.size() >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES )
if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES )
fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs);
else {
generalDecode(siteAlleles, ploidy, decoder, typeDescriptor, gbs);
@ -137,7 +137,7 @@ public class BCF2GenotypeFieldDecoders {
private final void fastBiallelicDiploidDecode(final List<Allele> siteAlleles,
final BCF2Decoder decoder,
final byte typeDescriptor,
final List<GenotypeBuilder> gbs) {
final GenotypeBuilder[] gbs) {
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
final int nPossibleGenotypes = 3 * 3;
@ -176,7 +176,7 @@ public class BCF2GenotypeFieldDecoders {
final int ploidy,
final BCF2Decoder decoder,
final byte typeDescriptor,
final List<GenotypeBuilder> gbs) {
final GenotypeBuilder[] gbs) {
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
// a single cache for the encoded genotypes, since we don't actually need this vector
@ -213,7 +213,7 @@ public class BCF2GenotypeFieldDecoders {
private class DPDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
// the -1 is for missing
gb.DP(decoder.decodeInt(typeDescriptor, -1));
@ -223,7 +223,7 @@ public class BCF2GenotypeFieldDecoders {
private class GQDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
// the -1 is for missing
gb.GQ(decoder.decodeInt(typeDescriptor, -1));
@ -233,7 +233,7 @@ public class BCF2GenotypeFieldDecoders {
private class ADDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
gb.AD(decoder.decodeIntArray(typeDescriptor));
}
@ -242,7 +242,7 @@ public class BCF2GenotypeFieldDecoders {
private class PLDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
gb.PL(decoder.decodeIntArray(typeDescriptor));
}
@ -251,7 +251,7 @@ public class BCF2GenotypeFieldDecoders {
private class GenericDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
Object value = decoder.decodeTypedValue(typeDescriptor);
if ( value != null ) { // don't add missing values
@ -270,7 +270,7 @@ public class BCF2GenotypeFieldDecoders {
private class FTDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
Object value = decoder.decodeTypedValue(typeDescriptor);
if ( value != null ) { // don't add missing values

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
@ -46,12 +47,16 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
private final ArrayList<Allele> siteAlleles;
private final int nSamples;
private final int nFields;
private final GenotypeBuilder[] builders;
BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList<Allele> alleles, final int nSamples, final int nFields) {
@Requires("codec.getHeader().getNGenotypeSamples() == builders.length")
BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList<Allele> alleles, final int nSamples,
final int nFields, final GenotypeBuilder[] builders) {
this.codec = codec;
this.siteAlleles = alleles;
this.nSamples = nSamples;
this.nFields = nFields;
this.builders = builders;
}
@Override
@ -62,21 +67,8 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
// load our byte[] data into the decoder
final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes);
// TODO -- fast path for sites only
// go ahead and decode everyone
final List<String> samples = new ArrayList<String>(codec.getHeader().getGenotypeSamples());
if ( samples.size() != nSamples )
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
"different numbers of samples per record. Saw " + samples.size() +
" samples in header but have a record with " + nSamples + " samples");
// create and initialize the genotypes array
final ArrayList<GenotypeBuilder> builders = new ArrayList<GenotypeBuilder>(nSamples);
for ( int i = 0; i < nSamples; i++ ) {
builders.add(new GenotypeBuilder(samples.get(i)));
}
for ( int i = 0; i < nSamples; i++ )
builders[i].reset(true);
for ( int i = 0; i < nFields; i++ ) {
// get the field name

View File

@ -344,7 +344,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
// do we have genotyping data
if (parts.length > NUM_STANDARD_FIELDS) {
final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos);
final int nGenotypes = header.getGenotypeSamples().size();
final int nGenotypes = header.getNGenotypeSamples();
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes);
// did we resort the sample names? If so, we need to load the genotype data

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import org.apache.log4j.Logger;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -35,6 +36,8 @@ import java.util.Map;
* a base class for compound header lines, which include info lines and format lines (so far)
*/
public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine {
final protected static Logger logger = Logger.getLogger(VCFHeader.class);
public enum SupportedHeaderLineType {
INFO(true), FORMAT(false);
@ -172,6 +175,11 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
if ( name == null || type == null || description == null || lineType == null )
throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s",
super.getKey(), name, type, description, lineType ));
if ( type == VCFHeaderLineType.Flag && count != 0 ) {
count = 0;
logger.warn("FLAG fields must have a count value of 0, but saw " + count + " for header line " + getID() + ". Changing it to 0 inside the code");
}
}
/**

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.codecs.vcf;
import org.apache.log4j.Logger;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
@ -54,11 +55,12 @@ public class VCFHeader {
private final Set<VCFHeaderLine> mMetaData = new TreeSet<VCFHeaderLine>();
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new HashMap<String, VCFFilterHeaderLine>();
private final Map<String, VCFHeaderLine> mOtherMetaData = new HashMap<String, VCFHeaderLine>();
private final List<VCFContigHeaderLine> contigMetaData = new ArrayList<VCFContigHeaderLine>();
// the list of auxillary tags
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
private final List<String> mGenotypeSampleNames = new ArrayList<String>();
// the character string that indicates meta data
public static final String METADATA_INDICATOR = "##";
@ -106,7 +108,15 @@ public class VCFHeader {
* @param genotypeSampleNames the sample names
*/
public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
this(metaData, new ArrayList<String>(genotypeSampleNames));
}
public VCFHeader(Set<VCFHeaderLine> metaData, List<String> genotypeSampleNames) {
this(metaData);
if ( genotypeSampleNames.size() != new HashSet<String>(genotypeSampleNames).size() )
throw new ReviewedStingException("BUG: VCF header has duplicate sample names");
mGenotypeSampleNames.addAll(genotypeSampleNames);
samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames);
buildVCFReaderMaps(genotypeSampleNames);
@ -175,12 +185,23 @@ public class VCFHeader {
} else if ( line instanceof VCFFormatHeaderLine ) {
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
addMetaDataMapBinding(mFormatMetaData, formatLine);
} else if ( line instanceof VCFFilterHeaderLine ) {
VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine)line;
mFilterMetaData.put(filterLine.getID(), filterLine);
} else if ( line instanceof VCFContigHeaderLine ) {
contigMetaData.add((VCFContigHeaderLine)line);
} else {
mOtherMetaData.put(line.getKey(), line);
}
}
if ( hasFormatLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && ! hasFormatLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
logger.warn("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no "
+ VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY + " field. As the GATK now only manages PL fields internally"
+ " automatically adding a corresponding PL field to your VCF header");
addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
loadMetaDataMaps();
}
}
/**
@ -239,7 +260,7 @@ public class VCFHeader {
*
* @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false
*/
public Set<String> getGenotypeSamples() {
public List<String> getGenotypeSamples() {
return mGenotypeSampleNames;
}
@ -294,6 +315,26 @@ public class VCFHeader {
return mFormatMetaData.get(id);
}
/**
* @param id the header key name
* @return the meta data line, or null if there is none
*/
public VCFFilterHeaderLine getFilterHeaderLine(final String id) {
return mFilterMetaData.get(id);
}
public boolean hasInfoLine(final String id) {
return getInfoHeaderLine(id) != null;
}
public boolean hasFormatLine(final String id) {
return getFormatHeaderLine(id) != null;
}
public boolean hasFilterLine(final String id) {
return getFilterHeaderLine(id) != null;
}
/**
* @param key the header key name
* @return the meta data line, or null if there is none

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.variantcontext;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
@ -49,6 +50,7 @@ import java.util.*;
* @author Mark DePristo
* @since 06/12
*/
@Invariant({"alleles != null"})
public final class GenotypeBuilder {
public static boolean MAKE_FAST_BY_DEFAULT = true;
@ -154,9 +156,9 @@ public final class GenotypeBuilder {
* function you must provide sampleName and alleles before trying to
* make more Genotypes.
*/
public final void reset() {
sampleName = null;
alleles = null;
public final void reset(final boolean keepSampleName) {
if ( ! keepSampleName ) sampleName = null;
alleles = Collections.emptyList();
isPhased = false;
GQ = -1;
DP = -1;

View File

@ -119,10 +119,6 @@ public class VariantContextUtils {
attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
}
else {
attributes.put(VCFConstants.ALLELE_COUNT_KEY, 0);
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, 0.0);
}
}
return attributes;

View File

@ -260,7 +260,7 @@ public abstract class BCF2FieldEncoder {
@Requires("isDynamicallyTyped()")
@Ensures("result != null")
public BCF2Type getDynamicType(final Object value) {
throw new ReviewedStingException("BUG: cannot get dynamic type for statically typed BCF2 field");
throw new ReviewedStingException("BUG: cannot get dynamic type for statically typed BCF2 field " + getField());
}
// ----------------------------------------------------------------------
@ -367,7 +367,7 @@ public abstract class BCF2FieldEncoder {
public Flag(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
super(headerLine, dict, BCF2Type.INT8);
if ( ! headerLine.isFixedCount() || headerLine.getCount() != 0 )
throw new ReviewedStingException("Flag encoder only suppports atomic flags!");
throw new ReviewedStingException("Flag encoder only suppports atomic flags for field " + getField());
}
@Override

View File

@ -33,5 +33,6 @@ package org.broadinstitute.sting.utils.variantcontext.writer;
public enum Options {
INDEX_ON_THE_FLY,
DO_NOT_WRITE_GENOTYPES,
ALLOW_MISSING_FIELDS_IN_HEADER,
FORCE_BCF
}

View File

@ -27,9 +27,9 @@ package org.broadinstitute.sting.utils.variantcontext.writer;
import net.sf.samtools.SAMSequenceDictionary;
import org.broad.tribble.TribbleException;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.*;
@ -54,12 +54,17 @@ class VCFWriter extends IndexingVariantContextWriter {
// were filters applied?
protected boolean filtersWereAppliedToContext = false;
final private boolean allowMissingFieldsInHeader;
private IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors();
public VCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
public VCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict,
final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes,
final boolean allowMissingFieldsInHeader ) {
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size
this.doNotWriteGenotypes = doNotWriteGenotypes;
this.allowMissingFieldsInHeader = allowMissingFieldsInHeader;
}
// --------------------------------------------------------------------------------
@ -222,6 +227,10 @@ class VCFWriter extends IndexingVariantContextWriter {
Map<String, String> infoFields = new TreeMap<String, String>();
for ( Map.Entry<String, Object> field : vc.getAttributes().entrySet() ) {
String key = field.getKey();
if ( ! mHeader.hasInfoLine(key) )
fieldIsMissingFromHeaderError(vc, key, "INFO");
String outputValue = formatVCFField(field.getValue());
if ( outputValue != null )
infoFields.put(key, outputValue);
@ -236,6 +245,10 @@ class VCFWriter extends IndexingVariantContextWriter {
} else {
List<String> genotypeAttributeKeys = calcVCFGenotypeKeys(vc, mHeader);
if ( ! genotypeAttributeKeys.isEmpty() ) {
for ( final String format : genotypeAttributeKeys )
if ( ! mHeader.hasFormatLine(format) )
fieldIsMissingFromHeaderError(vc, format, "FORMAT");
final String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
@ -270,12 +283,18 @@ class VCFWriter extends IndexingVariantContextWriter {
//
// --------------------------------------------------------------------------------
public static final String getFilterString(final VariantContext vc) {
return getFilterString(vc, false);
}
private final String getFilterString(final VariantContext vc, boolean forcePASS) {
if ( vc.isFiltered() ) {
for ( final String filter : vc.getFilters() )
if ( ! mHeader.hasFilterLine(filter) )
fieldIsMissingFromHeaderError(vc, filter, "FILTER");
public static final String getFilterString(final VariantContext vc, boolean forcePASS) {
return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
return ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters()));
}
else if ( forcePASS || vc.filtersWereApplied() )
return VCFConstants.PASSES_FILTERS_v4;
else
return VCFConstants.UNFILTERED;
}
private static final String QUAL_FORMAT_STRING = "%.2f";
@ -330,13 +349,13 @@ class VCFWriter extends IndexingVariantContextWriter {
*/
private void addGenotypeData(VariantContext vc, Map<Allele, String> alleleMap, List<String> genotypeFormatKeys)
throws IOException {
if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) {
final List<String> badSampleNames = new ArrayList<String>();
for ( final Genotype g : vc.getGenotypes() )
if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) )
badSampleNames.add(g.getSampleName());
throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames));
}
// if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) {
// final List<String> badSampleNames = new ArrayList<String>();
// for ( final Genotype g : vc.getGenotypes() )
// if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) )
// badSampleNames.add(g.getSampleName());
// throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames));
// }
for ( String sample : mHeader.getGenotypeSamples() ) {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
@ -553,4 +572,13 @@ class VCFWriter extends IndexingVariantContextWriter {
}
return count;
}
private final void fieldIsMissingFromHeaderError(final VariantContext vc, final String id, final String field) {
if ( !allowMissingFieldsInHeader)
throw new UserException.MalformedVCFHeader("Key " + id + " found in VariantContext field " + field
+ " at " + vc.getChr() + ":" + vc.getStart()
+ " but this key isn't defined in the VCFHeader. The GATK now requires all VCFs to have"
+ " complete VCF headers by default. This error can be disabled with the engine argument"
+ " --allowMissingVCFHeaders");
}
}

View File

@ -79,7 +79,8 @@ public class VariantContextWriterFactory {
else {
return new VCFWriter(location, output, refDict,
options.contains(Options.INDEX_ON_THE_FLY),
options.contains(Options.DO_NOT_WRITE_GENOTYPES));
options.contains(Options.DO_NOT_WRITE_GENOTYPES),
options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER));
}
}

View File

@ -277,7 +277,7 @@ public abstract class BaseTest {
Reporter.log(message, true);
}
private static final double DEFAULT_FLOAT_TOLERANCE = 1e-4;
private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1;
public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) {
Assert.assertTrue(actual instanceof Double);

View File

@ -51,7 +51,7 @@ import java.text.SimpleDateFormat;
import java.util.*;
public class WalkerTest extends BaseTest {
private static final boolean GENERATE_SHADOW_BCF = false;
private static final boolean GENERATE_SHADOW_BCF = true;
private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false;
private static final boolean ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX = false;

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.Test;
import java.util.Arrays;
@ -15,9 +16,11 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String testFile = testDir + "NA12878.hg19.example1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s --no_cmdline_in_header",
"-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant "
+ b37hapmapGenotypes + " -disc " + testFile
+ " -o %s --no_cmdline_in_header --allowMissingVCFHeaders --allowMissingVCFHeaders",
1,
Arrays.asList("133fd0ded0bb213097cbe68995afbb7e")
Arrays.asList("18ca4d3e69874c5e55377ca2ad7c6014")
);
spec.disableShadowBCF();
@ -31,7 +34,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -sn B -sn C --variant " + testfile),
1,
Arrays.asList("b2ee12588ebda200727762a903b8c972")
Arrays.asList("ddb082f5f15944b7545b45c20440a474")
);
executeTest("testRepeatedLineSelection--" + testfile, spec);
@ -42,9 +45,11 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String testFile = testDir + "NA12878.hg19.example1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s --no_cmdline_in_header",
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant "
+ b37hapmapGenotypes + " -disc " + testFile
+ " -o %s --no_cmdline_in_header --allowMissingVCFHeaders",
1,
Arrays.asList("f64c90c4cca470f1095d9fa2062eac3e")
Arrays.asList("2e0eaed94e2ab14349b967dc89a09823")
);
spec.disableShadowBCF();
@ -57,9 +62,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile),
1,
Arrays.asList("446eea62630bc5325ffab30b9b9fbfe4")
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile),
1,
Arrays.asList("88c142737080847e44dc8e8c982cfc74")
);
spec.disableShadowBCF();
executeTest("testComplexSelection--" + testfile, spec);
@ -71,9 +76,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile,
1,
Arrays.asList("b24f31db48d254d8fe15295955173486")
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile,
1,
Arrays.asList("338bfc635667793b89c349446982a36d")
);
spec.disableShadowBCF();
@ -86,9 +91,11 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String testFile = testDir + "NA12878.hg19.example1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " + b37hapmapGenotypes + " --variant " + testFile + " -o %s --no_cmdline_in_header",
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc "
+ b37hapmapGenotypes + " --variant " + testFile
+ " -o %s --no_cmdline_in_header --allowMissingVCFHeaders",
1,
Arrays.asList("9da5dab3d344c1c0a5987b15e60fa082")
Arrays.asList("7d2c7deeacf307d140a85a34629c6539")
);
spec.disableShadowBCF();
@ -102,7 +109,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("30b89b3a6706f7f46b23bfb3be69cc8e")
Arrays.asList("f0509f4c2c6c9eff3bec8171f00762b3")
);
executeTest("testVariantTypeSelection--" + testFile, spec);
@ -115,7 +122,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("8bf557aaa07eccb294c81f491225bf9e")
Arrays.asList("9162a67ccb4201c0542f30d14967f2d5")
);
executeTest("testUsingDbsnpName--" + testFile, spec);
@ -141,7 +148,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -select 'KG_FREQ < 0.5' --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("cb9932f9a7aa2e53af605b30d88ad43f")
Arrays.asList("7dabe6910a40a9db5006c6af66b4617c")
);
executeTest("testMultipleRecordsAtOnePosition--" + testFile, spec);
@ -154,7 +161,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("920605cc2182026e3f54c009f6a04141")
Arrays.asList("034965918283d5597bb443e65cd20cf8")
);
executeTest("testNoGTs--" + testFile, spec);
@ -167,9 +174,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec;
spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"),
1,
Arrays.asList("446eea62630bc5325ffab30b9b9fbfe4")
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"),
1,
Arrays.asList("88c142737080847e44dc8e8c982cfc74")
);
spec.disableShadowBCF();
executeTest("testParallelization (2 threads)--" + testfile, spec);
@ -177,13 +184,13 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
@Test(enabled = false)
public void testParallelization4() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec;
spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"),
1,
Arrays.asList("446eea62630bc5325ffab30b9b9fbfe4")
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec;
spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"),
1,
Arrays.asList("88c142737080847e44dc8e8c982cfc74")
);
spec.disableShadowBCF();
@ -197,8 +204,32 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile,
1,
Arrays.asList("2f2a342812ba914bcce666e42ef761d7")
Arrays.asList("3cbabcd256070ea113ddc04fef3b6900")
);
executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec);
}
@Test()
public void testFileWithoutInfoLineInHeader() {
testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeader", UserException.class);
}
@Test()
public void testFileWithoutInfoLineInHeaderWithOverride() {
testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeaderWithOverride", null);
}
private void testFileWithoutInfoLineInHeader(final String name, final Class expectedException) {
final String testFile = testDir + "missingHeaderLine.vcf";
final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp "
+ testFile + " -o %s --no_cmdline_in_header"
+ (expectedException == null ? " -allowMissingVCFHeaders" : "");
WalkerTestSpec spec =
expectedException != null
? new WalkerTestSpec(cmd, 1, expectedException)
: new WalkerTestSpec(cmd, 1, Arrays.asList(""));
spec.disableShadowBCF();
executeTest(name, spec);
}
}

View File

@ -142,16 +142,16 @@ public class VariantContextTestProvider {
if ( ENABLE_SOURCE_VCF_TESTS ) {
for ( final File file : testSourceVCFs ) {
VCFCodec codec = new VCFCodec();
Pair<VCFHeader, List<VariantContext>> x = readAllVCs( file, codec );
List<VariantContext> fullyDecoded = new ArrayList<VariantContext>(x.getSecond().size());
int i = 0;
Pair<VCFHeader, Iterable<VariantContext>> x = readAllVCs( file, codec );
List<VariantContext> fullyDecoded = new ArrayList<VariantContext>();
logger.warn("Reading records from " + file);
for ( final VariantContext raw : x.getSecond() ) {
fullyDecoded.add(raw.fullyDecode(x.getFirst()));
logger.warn("\t" + i++);
}
logger.warn("Done reading " + file);
TEST_DATAs.add(new VariantContextTestData(x.getFirst(), x.getSecond()));
TEST_DATAs.add(new VariantContextTestData(x.getFirst(), fullyDecoded));
}
}
}
@ -326,6 +326,10 @@ public class VariantContextTestProvider {
final Genotype homVar = GenotypeBuilder.create("homVar", Arrays.asList(alt1, alt1));
addGenotypeTests(site, homRef, het);
addGenotypeTests(site, homRef, het, homVar);
// test no GT at all
addGenotypeTests(site);
final List<Allele> noCall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
// ploidy
@ -511,7 +515,7 @@ public class VariantContextTestProvider {
writer.add(vc);
writer.close();
final List<VariantContext> actual = readAllVCs(tmpFile, tester.makeCodec()).getSecond();
final Iterable<VariantContext> actual = readAllVCs(tmpFile, tester.makeCodec()).getSecond();
assertEquals(actual, expected);
}
@ -524,7 +528,7 @@ public class VariantContextTestProvider {
* @return
* @throws IOException
*/
private final static Pair<VCFHeader, List<VariantContext>> readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
private final static Pair<VCFHeader, Iterable<VariantContext>> readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
// read in the features
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source));
FeatureCodecHeader header = codec.readHeader(pbs);
@ -533,27 +537,79 @@ public class VariantContextTestProvider {
pbs = new PositionalBufferedStream(new FileInputStream(source));
pbs.skip(header.getHeaderEnd());
final List<VariantContext> read = new ArrayList<VariantContext>();
while ( ! pbs.isDone() ) {
final VariantContext vc = codec.decode(pbs);
if ( vc != null ) read.add(vc.fullyDecode((VCFHeader)header.getHeaderValue()));
};
final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue();
return new Pair<VCFHeader, Iterable<VariantContext>>(vcfHeader, new VCIterable(pbs, codec, vcfHeader));
}
return new Pair<VCFHeader, List<VariantContext>>((VCFHeader)header.getHeaderValue(), read);
private static class VCIterable implements Iterable<VariantContext>, Iterator<VariantContext> {
final PositionalBufferedStream pbs;
final FeatureCodec<VariantContext> codec;
final VCFHeader header;
private VCIterable(final PositionalBufferedStream pbs, final FeatureCodec<VariantContext> codec, final VCFHeader header) {
this.pbs = pbs;
this.codec = codec;
this.header = header;
}
@Override
public Iterator<VariantContext> iterator() {
return this;
}
@Override
public boolean hasNext() {
try {
return ! pbs.isDone();
} catch ( IOException e ) {
throw new RuntimeException(e);
}
}
@Override
public VariantContext next() {
try {
final VariantContext vc = codec.decode(pbs);
return vc == null ? null : vc.fullyDecode(header);
} catch ( IOException e ) {
throw new RuntimeException(e);
}
}
@Override
public void remove() {
//To change body of implemented methods use File | Settings | File Templates.
}
}
public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException {
final Pair<VCFHeader, List<VariantContext>> vcfData = readAllVCs(vcfFile, new VCFCodec());
final Pair<VCFHeader, List<VariantContext>> bcfData = readAllVCs(bcfFile, new BCF2Codec());
final Pair<VCFHeader, Iterable<VariantContext>> vcfData = readAllVCs(vcfFile, new VCFCodec());
final Pair<VCFHeader, Iterable<VariantContext>> bcfData = readAllVCs(bcfFile, new BCF2Codec());
assertEquals(bcfData.getFirst(), vcfData.getFirst());
assertEquals(bcfData.getSecond(), vcfData.getSecond());
}
public static void assertEquals(final List<VariantContext> actual, final List<VariantContext> expected) {
Assert.assertEquals(actual.size(), expected.size());
public static void assertEquals(final Iterable<VariantContext> actual, final Iterable<VariantContext> expected) {
final Iterator<VariantContext> actualIT = actual.iterator();
final Iterator<VariantContext> expectedIT = expected.iterator();
for ( int i = 0; i < expected.size(); i++ )
assertEquals(actual.get(i), expected.get(i));
while ( expectedIT.hasNext() ) {
final VariantContext expectedVC = expectedIT.next();
if ( expectedVC == null )
continue;
VariantContext actualVC;
do {
Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual");
actualVC = actualIT.next();
} while ( actualIT.hasNext() && actualVC == null );
if ( actualVC == null )
Assert.fail("Too few records in actual");
assertEquals(actualVC, expectedVC);
}
Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual");
}
/**
@ -730,4 +786,14 @@ public class VariantContextTestProvider {
Assert.assertEquals(actualLines.get(i), expectedLines.get(i));
}
}
public static void main( String argv[] ) {
final File variants1 = new File(argv[0]);
final File variants2 = new File(argv[1]);
try {
VariantContextTestProvider.assertVCFandBCFFilesAreTheSame(variants1, variants2);
} catch ( IOException e ) {
throw new RuntimeException(e);
}
}
}