diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 13c737a2e..babbb7ab8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -347,6 +347,9 @@ public class GATKArgumentCollection { public boolean USE_SLOW_GENOTYPES = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed + @Argument(fullName="allowMissingVCFHeaders",shortName = "allowMissingVCFHeaders",doc="If provided, the GATK will write out VCF files that contain INFO, FILTER, and FORMAT fields not found in the VCF header",required=false) + public boolean allowMissingVCFHeaders = false; + /** * The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file * and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java index b97475000..d0fdae639 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java @@ -74,7 +74,8 @@ public class VariantContextWriterStorage implements Storage, Var List options = new ArrayList(); if ( doNotWriteGenotypes ) options.add(Options.DO_NOT_WRITE_GENOTYPES); + if ( engine.getArguments().allowMissingVCFHeaders ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER); if ( indexOnTheFly && ! isCompressed() ) options.add(Options.INDEX_ON_THE_FLY); return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java index 07b04eb9c..c5ad435b3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java @@ -68,9 +68,10 @@ public class TandemRepeatAnnotator extends InfoFieldAnnotation implements Standa } public static final String[] keyNames = {STR_PRESENT, REPEAT_UNIT_KEY,REPEATS_PER_ALLELE_KEY }; - public static final VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(STR_PRESENT, 1, VCFHeaderLineType.Flag, "Variant is a short tandem repeat"), + public static final VCFInfoHeaderLine[] descriptions = { + new VCFInfoHeaderLine(STR_PRESENT, 0, VCFHeaderLineType.Flag, "Variant is a short tandem repeat"), new VCFInfoHeaderLine(REPEAT_UNIT_KEY, 1, VCFHeaderLineType.String, "Tandem repeat unit (bases)"), - new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Number of times tandem repeat unit is repeated, for each allele (including reference)") }; + new VCFInfoHeaderLine(REPEATS_PER_ALLELE_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Number of times tandem repeat unit is repeated, for each allele (including reference)") }; public List getKeyNames() { return Arrays.asList(keyNames); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index f31957123..4e9e0afce 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -203,8 +203,9 @@ public class VariantAnnotatorEngine { // go through all the requested info annotationTypes for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { Map annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(stratifiedContexts, vc); - if ( annotationsFromCurrentType != null ) + if ( annotationsFromCurrentType != null ) { infoAnnotations.putAll(annotationsFromCurrentType); + } } // generate a new annotated VC diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index a0f23531e..b9e768274 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -198,8 +198,9 @@ public class VariantEvalWalker extends RodWalker implements Tr // Variables private Set jexlExpressions = new TreeSet(); - private Set sampleNamesForEvaluation = new TreeSet(); - private Set sampleNamesForStratification = new TreeSet(); + private boolean isSubsettingSamples; + private Set sampleNamesForEvaluation = new LinkedHashSet(); + private Set sampleNamesForStratification = new LinkedHashSet(); // important stratifications private boolean byFilterIsEnabled = false; @@ -249,8 +250,10 @@ public class VariantEvalWalker extends RodWalker implements Tr Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - // Load the sample list - sampleNamesForEvaluation.addAll(SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS)); + // Load the sample list, using an intermediate tree set to sort the samples + final Set allSampleNames = SampleUtils.getSamplesFromCommandLineInput(vcfSamples); + sampleNamesForEvaluation.addAll(new TreeSet(SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS))); + isSubsettingSamples = ! sampleNamesForEvaluation.containsAll(allSampleNames); if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) { sampleNamesForStratification.addAll(sampleNamesForEvaluation); @@ -571,6 +574,7 @@ public class VariantEvalWalker extends RodWalker implements Tr public List> getEvals() { return evals; } + public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; } public Set getSampleNamesForEvaluation() { return sampleNamesForEvaluation; } public Set getSampleNamesForStratification() { return sampleNamesForStratification; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index fe21e158f..39033bfed 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -28,8 +28,6 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.report.GATKReport; -import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.StandardEval; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; @@ -37,13 +35,13 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.Require import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.StandardStratification; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import java.lang.reflect.Field; import java.util.*; public class VariantEvalUtils { @@ -199,18 +197,32 @@ public class VariantEvalUtils { * @return a new VariantContext with just the requested samples */ public VariantContext getSubsetOfVariantContext(VariantContext vc, Set sampleNames) { - VariantContext vcsub = vc.subContextFromSamples(sampleNames, false); - VariantContextBuilder builder = new VariantContextBuilder(vcsub); + return ensureAnnotations(vc, vc.subContextFromSamples(sampleNames, false)); + } + public VariantContext ensureAnnotations(final VariantContext vc, final VariantContext vcsub) { final int originalAlleleCount = vc.getHetCount() + 2 * vc.getHomVarCount(); final int newAlleleCount = vcsub.getHetCount() + 2 * vcsub.getHomVarCount(); + final boolean isSingleton = originalAlleleCount == newAlleleCount && newAlleleCount == 1; + final boolean hasChrCountAnnotations = vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && + vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) && + vc.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY); - if (originalAlleleCount == newAlleleCount && newAlleleCount == 1) { - builder.attribute(VariantEvalWalker.IS_SINGLETON_KEY, true); + if ( ! isSingleton && hasChrCountAnnotations ) { + // nothing to update + return vc; + } else { + // have to do the work + VariantContextBuilder builder = new VariantContextBuilder(vc); + + if ( isSingleton ) + builder.attribute(VariantEvalWalker.IS_SINGLETON_KEY, true); + + if ( ! hasChrCountAnnotations ) + VariantContextUtils.calculateChromosomeCounts(builder, true); + + return builder.make(); } - - VariantContextUtils.calculateChromosomeCounts(builder, true); - return builder.make(); } /** @@ -250,8 +262,11 @@ public class VariantEvalUtils { // First, filter the VariantContext to represent only the samples for evaluation VariantContext vcsub = vc; - if (subsetBySample && vc.hasGenotypes() && vc.hasGenotypes(variantEvalWalker.getSampleNamesForEvaluation())) { - vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation()); + if (subsetBySample && vc.hasGenotypes()) { + if ( variantEvalWalker.isSubsettingToSpecificSamples() ) + vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation()); + else + vcsub = ensureAnnotations(vc, vc); } if ((byFilter || !vcsub.isFiltered())) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 3e84e9c52..b7388c4d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -427,12 +427,12 @@ public class SelectVariants extends RodWalker implements TreeR headerLines.add(new VCFHeaderLine("source", "SelectVariants")); if (KEEP_ORIGINAL_CHR_COUNTS) { - headerLines.add(new VCFFormatHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC")); - headerLines.add(new VCFFormatHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF")); - headerLines.add(new VCFFormatHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN")); + headerLines.add(new VCFInfoHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC")); + headerLines.add(new VCFInfoHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF")); + headerLines.add(new VCFInfoHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN")); } headerLines.addAll(Arrays.asList(ChromosomeCounts.descriptions)); - headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Depth of coverage")); + headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Depth of coverage")); vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) { diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 360a855fa..26c95bffd 100755 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -150,8 +150,7 @@ public class SampleUtils { // iterate to get all of the sample names for ( Map.Entry pair : VCFUtils.getVCFHeadersFromRods(toolkit).entrySet() ) { - Set vcfSamples = pair.getValue().getGenotypeSamples(); - for ( String sample : vcfSamples ) + for ( String sample : pair.getValue().getGenotypeSamples() ) addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey()); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 574677334..ab20299de 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -79,6 +79,14 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD */ private BCF2GenotypeFieldDecoders gtFieldDecoders = null; + /** + * A cached array of GenotypeBuilders for efficient genotype decoding. + * + * Caching it allows us to avoid recreating this intermediate data + * structure each time we decode genotypes + */ + private GenotypeBuilder[] builders = null; + // for error handling private int recordNo = 0; private int pos = 0; @@ -168,6 +176,13 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD // prepare the genotype field decoders gtFieldDecoders = new BCF2GenotypeFieldDecoders(header); + // create and initialize the genotype builder array + final int nSamples = header.getNGenotypeSamples(); + builders = new GenotypeBuilder[nSamples]; + for ( int i = 0; i < nSamples; i++ ) { + builders[i] = new GenotypeBuilder(header.getGenotypeSamples().get(i)); + } + // position right before next line (would be right before first real record byte at end of header) return new FeatureCodecHeader(header, inputStream.getPosition()); } @@ -256,6 +271,11 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD final int nFormatFields = nFormatSamples >> 24; final int nSamples = nFormatSamples & 0x00FFFFF; + if ( header.getNGenotypeSamples() != nSamples ) + throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " + + "different numbers of samples per record. Saw " + header.getNGenotypeSamples() + + " samples in header but have a record with " + nSamples + " samples"); + decodeID(builder); final ArrayList alleles = decodeAlleles(builder, pos, nAlleles); decodeFilter(builder); @@ -416,11 +436,11 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD final VariantContextBuilder builder ) { if (siteInfo.nSamples > 0) { final LazyGenotypesContext.LazyParser lazyParser = - new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields); - final int nGenotypes = header.getGenotypeSamples().size(); + new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders); + LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, new LazyData(siteInfo.nFormatFields, decoder.getRecordBytes()), - nGenotypes); + header.getNGenotypeSamples()); // did we resort the sample names? If so, we need to load the genotype data if ( !header.samplesWereAlreadySorted() ) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java index e8e843546..0c737c9a2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java @@ -99,21 +99,21 @@ public class BCF2GenotypeFieldDecoders { */ public interface Decoder { @Requires({"siteAlleles != null", "! siteAlleles.isEmpty()", - "field != null", "decoder != null", "gbs != null", "! gbs.isEmpty()"}) + "field != null", "decoder != null", "gbs != null", "gbs.length != 0"}) public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, - final List gbs); + final GenotypeBuilder[] gbs); } private class GTDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { // we have to do a bit of low-level processing here as we want to know the size upfronta final int ploidy = decoder.decodeNumberOfElements(typeDescriptor); - if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.size() >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES ) + if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES ) fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs); else { generalDecode(siteAlleles, ploidy, decoder, typeDescriptor, gbs); @@ -137,7 +137,7 @@ public class BCF2GenotypeFieldDecoders { private final void fastBiallelicDiploidDecode(final List siteAlleles, final BCF2Decoder decoder, final byte typeDescriptor, - final List gbs) { + final GenotypeBuilder[] gbs) { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); final int nPossibleGenotypes = 3 * 3; @@ -176,7 +176,7 @@ public class BCF2GenotypeFieldDecoders { final int ploidy, final BCF2Decoder decoder, final byte typeDescriptor, - final List gbs) { + final GenotypeBuilder[] gbs) { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); // a single cache for the encoded genotypes, since we don't actually need this vector @@ -213,7 +213,7 @@ public class BCF2GenotypeFieldDecoders { private class DPDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { // the -1 is for missing gb.DP(decoder.decodeInt(typeDescriptor, -1)); @@ -223,7 +223,7 @@ public class BCF2GenotypeFieldDecoders { private class GQDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { // the -1 is for missing gb.GQ(decoder.decodeInt(typeDescriptor, -1)); @@ -233,7 +233,7 @@ public class BCF2GenotypeFieldDecoders { private class ADDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { gb.AD(decoder.decodeIntArray(typeDescriptor)); } @@ -242,7 +242,7 @@ public class BCF2GenotypeFieldDecoders { private class PLDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { gb.PL(decoder.decodeIntArray(typeDescriptor)); } @@ -251,7 +251,7 @@ public class BCF2GenotypeFieldDecoders { private class GenericDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { Object value = decoder.decodeTypedValue(typeDescriptor); if ( value != null ) { // don't add missing values @@ -270,7 +270,7 @@ public class BCF2GenotypeFieldDecoders { private class FTDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { Object value = decoder.decodeTypedValue(typeDescriptor); if ( value != null ) { // don't add missing values diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java index 9f5035086..7f10375bb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.codecs.bcf2; +import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.*; @@ -46,12 +47,16 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { private final ArrayList siteAlleles; private final int nSamples; private final int nFields; + private final GenotypeBuilder[] builders; - BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList alleles, final int nSamples, final int nFields) { + @Requires("codec.getHeader().getNGenotypeSamples() == builders.length") + BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList alleles, final int nSamples, + final int nFields, final GenotypeBuilder[] builders) { this.codec = codec; this.siteAlleles = alleles; this.nSamples = nSamples; this.nFields = nFields; + this.builders = builders; } @Override @@ -62,21 +67,8 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { // load our byte[] data into the decoder final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes); - // TODO -- fast path for sites only - - // go ahead and decode everyone - final List samples = new ArrayList(codec.getHeader().getGenotypeSamples()); - - if ( samples.size() != nSamples ) - throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " + - "different numbers of samples per record. Saw " + samples.size() + - " samples in header but have a record with " + nSamples + " samples"); - - // create and initialize the genotypes array - final ArrayList builders = new ArrayList(nSamples); - for ( int i = 0; i < nSamples; i++ ) { - builders.add(new GenotypeBuilder(samples.get(i))); - } + for ( int i = 0; i < nSamples; i++ ) + builders[i].reset(true); for ( int i = 0; i < nFields; i++ ) { // get the field name diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 37e2aa218..d8eebdfe2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -344,7 +344,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec // do we have genotyping data if (parts.length > NUM_STANDARD_FIELDS) { final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos); - final int nGenotypes = header.getGenotypeSamples().size(); + final int nGenotypes = header.getNGenotypeSamples(); LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes); // did we resort the sample names? If so, we need to load the genotype data diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index 608a2026a..239748325 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import org.apache.log4j.Logger; import org.broad.tribble.TribbleException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -35,6 +36,8 @@ import java.util.Map; * a base class for compound header lines, which include info lines and format lines (so far) */ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine { + final protected static Logger logger = Logger.getLogger(VCFHeader.class); + public enum SupportedHeaderLineType { INFO(true), FORMAT(false); @@ -172,6 +175,11 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF if ( name == null || type == null || description == null || lineType == null ) throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s", super.getKey(), name, type, description, lineType )); + + if ( type == VCFHeaderLineType.Flag && count != 0 ) { + count = 0; + logger.warn("FLAG fields must have a count value of 0, but saw " + count + " for header line " + getID() + ". Changing it to 0 inside the code"); + } } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index 0f277a5b0..0a800d7d0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.apache.log4j.Logger; import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -54,11 +55,12 @@ public class VCFHeader { private final Set mMetaData = new TreeSet(); private final Map mInfoMetaData = new HashMap(); private final Map mFormatMetaData = new HashMap(); + private final Map mFilterMetaData = new HashMap(); private final Map mOtherMetaData = new HashMap(); private final List contigMetaData = new ArrayList(); // the list of auxillary tags - private final Set mGenotypeSampleNames = new LinkedHashSet(); + private final List mGenotypeSampleNames = new ArrayList(); // the character string that indicates meta data public static final String METADATA_INDICATOR = "##"; @@ -106,7 +108,15 @@ public class VCFHeader { * @param genotypeSampleNames the sample names */ public VCFHeader(Set metaData, Set genotypeSampleNames) { + this(metaData, new ArrayList(genotypeSampleNames)); + } + + public VCFHeader(Set metaData, List genotypeSampleNames) { this(metaData); + + if ( genotypeSampleNames.size() != new HashSet(genotypeSampleNames).size() ) + throw new ReviewedStingException("BUG: VCF header has duplicate sample names"); + mGenotypeSampleNames.addAll(genotypeSampleNames); samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames); buildVCFReaderMaps(genotypeSampleNames); @@ -175,12 +185,23 @@ public class VCFHeader { } else if ( line instanceof VCFFormatHeaderLine ) { VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; addMetaDataMapBinding(mFormatMetaData, formatLine); + } else if ( line instanceof VCFFilterHeaderLine ) { + VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine)line; + mFilterMetaData.put(filterLine.getID(), filterLine); } else if ( line instanceof VCFContigHeaderLine ) { contigMetaData.add((VCFContigHeaderLine)line); } else { mOtherMetaData.put(line.getKey(), line); } } + + if ( hasFormatLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && ! hasFormatLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) { + logger.warn("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no " + + VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY + " field. As the GATK now only manages PL fields internally" + + " automatically adding a corresponding PL field to your VCF header"); + addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); + loadMetaDataMaps(); + } } /** @@ -239,7 +260,7 @@ public class VCFHeader { * * @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false */ - public Set getGenotypeSamples() { + public List getGenotypeSamples() { return mGenotypeSampleNames; } @@ -294,6 +315,26 @@ public class VCFHeader { return mFormatMetaData.get(id); } + /** + * @param id the header key name + * @return the meta data line, or null if there is none + */ + public VCFFilterHeaderLine getFilterHeaderLine(final String id) { + return mFilterMetaData.get(id); + } + + public boolean hasInfoLine(final String id) { + return getInfoHeaderLine(id) != null; + } + + public boolean hasFormatLine(final String id) { + return getFormatHeaderLine(id) != null; + } + + public boolean hasFilterLine(final String id) { + return getFilterHeaderLine(id) != null; + } + /** * @param key the header key name * @return the meta data line, or null if there is none diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java index 15b1b386a..160f96056 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.variantcontext; import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -49,6 +50,7 @@ import java.util.*; * @author Mark DePristo * @since 06/12 */ +@Invariant({"alleles != null"}) public final class GenotypeBuilder { public static boolean MAKE_FAST_BY_DEFAULT = true; @@ -154,9 +156,9 @@ public final class GenotypeBuilder { * function you must provide sampleName and alleles before trying to * make more Genotypes. */ - public final void reset() { - sampleName = null; - alleles = null; + public final void reset(final boolean keepSampleName) { + if ( ! keepSampleName ) sampleName = null; + alleles = Collections.emptyList(); isPhased = false; GQ = -1; DP = -1; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index a51f2189d..9276046dc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -119,10 +119,6 @@ public class VariantContextUtils { attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); } - else { - attributes.put(VCFConstants.ALLELE_COUNT_KEY, 0); - attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, 0.0); - } } return attributes; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java index b466038dd..2ff32964f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java @@ -260,7 +260,7 @@ public abstract class BCF2FieldEncoder { @Requires("isDynamicallyTyped()") @Ensures("result != null") public BCF2Type getDynamicType(final Object value) { - throw new ReviewedStingException("BUG: cannot get dynamic type for statically typed BCF2 field"); + throw new ReviewedStingException("BUG: cannot get dynamic type for statically typed BCF2 field " + getField()); } // ---------------------------------------------------------------------- @@ -367,7 +367,7 @@ public abstract class BCF2FieldEncoder { public Flag(final VCFCompoundHeaderLine headerLine, final Map dict ) { super(headerLine, dict, BCF2Type.INT8); if ( ! headerLine.isFixedCount() || headerLine.getCount() != 0 ) - throw new ReviewedStingException("Flag encoder only suppports atomic flags!"); + throw new ReviewedStingException("Flag encoder only suppports atomic flags for field " + getField()); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/Options.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/Options.java index 7180ae6bc..3a5cb23b0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/Options.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/Options.java @@ -33,5 +33,6 @@ package org.broadinstitute.sting.utils.variantcontext.writer; public enum Options { INDEX_ON_THE_FLY, DO_NOT_WRITE_GENOTYPES, + ALLOW_MISSING_FIELDS_IN_HEADER, FORCE_BCF } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index 2679c51b1..69649aca7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -27,9 +27,9 @@ package org.broadinstitute.sting.utils.variantcontext.writer; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; @@ -54,12 +54,17 @@ class VCFWriter extends IndexingVariantContextWriter { // were filters applied? protected boolean filtersWereAppliedToContext = false; + final private boolean allowMissingFieldsInHeader; + private IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors(); - public VCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { + public VCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, + final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes, + final boolean allowMissingFieldsInHeader ) { super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing); mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size this.doNotWriteGenotypes = doNotWriteGenotypes; + this.allowMissingFieldsInHeader = allowMissingFieldsInHeader; } // -------------------------------------------------------------------------------- @@ -222,6 +227,10 @@ class VCFWriter extends IndexingVariantContextWriter { Map infoFields = new TreeMap(); for ( Map.Entry field : vc.getAttributes().entrySet() ) { String key = field.getKey(); + + if ( ! mHeader.hasInfoLine(key) ) + fieldIsMissingFromHeaderError(vc, key, "INFO"); + String outputValue = formatVCFField(field.getValue()); if ( outputValue != null ) infoFields.put(key, outputValue); @@ -236,6 +245,10 @@ class VCFWriter extends IndexingVariantContextWriter { } else { List genotypeAttributeKeys = calcVCFGenotypeKeys(vc, mHeader); if ( ! genotypeAttributeKeys.isEmpty() ) { + for ( final String format : genotypeAttributeKeys ) + if ( ! mHeader.hasFormatLine(format) ) + fieldIsMissingFromHeaderError(vc, format, "FORMAT"); + final String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys); mWriter.write(VCFConstants.FIELD_SEPARATOR); @@ -270,12 +283,18 @@ class VCFWriter extends IndexingVariantContextWriter { // // -------------------------------------------------------------------------------- - public static final String getFilterString(final VariantContext vc) { - return getFilterString(vc, false); - } + private final String getFilterString(final VariantContext vc, boolean forcePASS) { + if ( vc.isFiltered() ) { + for ( final String filter : vc.getFilters() ) + if ( ! mHeader.hasFilterLine(filter) ) + fieldIsMissingFromHeaderError(vc, filter, "FILTER"); - public static final String getFilterString(final VariantContext vc, boolean forcePASS) { - return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED); + return ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())); + } + else if ( forcePASS || vc.filtersWereApplied() ) + return VCFConstants.PASSES_FILTERS_v4; + else + return VCFConstants.UNFILTERED; } private static final String QUAL_FORMAT_STRING = "%.2f"; @@ -330,13 +349,13 @@ class VCFWriter extends IndexingVariantContextWriter { */ private void addGenotypeData(VariantContext vc, Map alleleMap, List genotypeFormatKeys) throws IOException { - if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) { - final List badSampleNames = new ArrayList(); - for ( final Genotype g : vc.getGenotypes() ) - if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) ) - badSampleNames.add(g.getSampleName()); - throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames)); - } +// if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) { +// final List badSampleNames = new ArrayList(); +// for ( final Genotype g : vc.getGenotypes() ) +// if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) ) +// badSampleNames.add(g.getSampleName()); +// throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames)); +// } for ( String sample : mHeader.getGenotypeSamples() ) { mWriter.write(VCFConstants.FIELD_SEPARATOR); @@ -553,4 +572,13 @@ class VCFWriter extends IndexingVariantContextWriter { } return count; } + + private final void fieldIsMissingFromHeaderError(final VariantContext vc, final String id, final String field) { + if ( !allowMissingFieldsInHeader) + throw new UserException.MalformedVCFHeader("Key " + id + " found in VariantContext field " + field + + " at " + vc.getChr() + ":" + vc.getStart() + + " but this key isn't defined in the VCFHeader. The GATK now requires all VCFs to have" + + " complete VCF headers by default. This error can be disabled with the engine argument" + + " --allowMissingVCFHeaders"); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWriterFactory.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWriterFactory.java index db3d4f343..f23166a02 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWriterFactory.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWriterFactory.java @@ -79,7 +79,8 @@ public class VariantContextWriterFactory { else { return new VCFWriter(location, output, refDict, options.contains(Options.INDEX_ON_THE_FLY), - options.contains(Options.DO_NOT_WRITE_GENOTYPES)); + options.contains(Options.DO_NOT_WRITE_GENOTYPES), + options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER)); } } diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 912ced2be..94b3b54ba 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -277,7 +277,7 @@ public abstract class BaseTest { Reporter.log(message, true); } - private static final double DEFAULT_FLOAT_TOLERANCE = 1e-4; + private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1; public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) { Assert.assertTrue(actual instanceof Double); diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 9871f637c..3b252528e 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -51,7 +51,7 @@ import java.text.SimpleDateFormat; import java.util.*; public class WalkerTest extends BaseTest { - private static final boolean GENERATE_SHADOW_BCF = false; + private static final boolean GENERATE_SHADOW_BCF = true; private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false; private static final boolean ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX = false; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 4e7131c7e..1acf33f0b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.util.Arrays; @@ -15,9 +16,11 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testFile = testDir + "NA12878.hg19.example1.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s --no_cmdline_in_header", + "-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " + + b37hapmapGenotypes + " -disc " + testFile + + " -o %s --no_cmdline_in_header --allowMissingVCFHeaders --allowMissingVCFHeaders", 1, - Arrays.asList("133fd0ded0bb213097cbe68995afbb7e") + Arrays.asList("18ca4d3e69874c5e55377ca2ad7c6014") ); spec.disableShadowBCF(); @@ -31,7 +34,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -sn A -sn B -sn C --variant " + testfile), 1, - Arrays.asList("b2ee12588ebda200727762a903b8c972") + Arrays.asList("ddb082f5f15944b7545b45c20440a474") ); executeTest("testRepeatedLineSelection--" + testfile, spec); @@ -42,9 +45,11 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testFile = testDir + "NA12878.hg19.example1.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s --no_cmdline_in_header", + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + + b37hapmapGenotypes + " -disc " + testFile + + " -o %s --no_cmdline_in_header --allowMissingVCFHeaders", 1, - Arrays.asList("f64c90c4cca470f1095d9fa2062eac3e") + Arrays.asList("2e0eaed94e2ab14349b967dc89a09823") ); spec.disableShadowBCF(); @@ -57,9 +62,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), - 1, - Arrays.asList("446eea62630bc5325ffab30b9b9fbfe4") + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), + 1, + Arrays.asList("88c142737080847e44dc8e8c982cfc74") ); spec.disableShadowBCF(); executeTest("testComplexSelection--" + testfile, spec); @@ -71,9 +76,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile, - 1, - Arrays.asList("b24f31db48d254d8fe15295955173486") + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile, + 1, + Arrays.asList("338bfc635667793b89c349446982a36d") ); spec.disableShadowBCF(); @@ -86,9 +91,11 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testFile = testDir + "NA12878.hg19.example1.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " + b37hapmapGenotypes + " --variant " + testFile + " -o %s --no_cmdline_in_header", + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " + + b37hapmapGenotypes + " --variant " + testFile + + " -o %s --no_cmdline_in_header --allowMissingVCFHeaders", 1, - Arrays.asList("9da5dab3d344c1c0a5987b15e60fa082") + Arrays.asList("7d2c7deeacf307d140a85a34629c6539") ); spec.disableShadowBCF(); @@ -102,7 +109,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("30b89b3a6706f7f46b23bfb3be69cc8e") + Arrays.asList("f0509f4c2c6c9eff3bec8171f00762b3") ); executeTest("testVariantTypeSelection--" + testFile, spec); @@ -115,7 +122,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("8bf557aaa07eccb294c81f491225bf9e") + Arrays.asList("9162a67ccb4201c0542f30d14967f2d5") ); executeTest("testUsingDbsnpName--" + testFile, spec); @@ -141,7 +148,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -select 'KG_FREQ < 0.5' --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("cb9932f9a7aa2e53af605b30d88ad43f") + Arrays.asList("7dabe6910a40a9db5006c6af66b4617c") ); executeTest("testMultipleRecordsAtOnePosition--" + testFile, spec); @@ -154,7 +161,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("920605cc2182026e3f54c009f6a04141") + Arrays.asList("034965918283d5597bb443e65cd20cf8") ); executeTest("testNoGTs--" + testFile, spec); @@ -167,9 +174,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec; spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"), - 1, - Arrays.asList("446eea62630bc5325ffab30b9b9fbfe4") + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"), + 1, + Arrays.asList("88c142737080847e44dc8e8c982cfc74") ); spec.disableShadowBCF(); executeTest("testParallelization (2 threads)--" + testfile, spec); @@ -177,13 +184,13 @@ public class SelectVariantsIntegrationTest extends WalkerTest { @Test(enabled = false) public void testParallelization4() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - WalkerTestSpec spec; - spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"), - 1, - Arrays.asList("446eea62630bc5325ffab30b9b9fbfe4") + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + WalkerTestSpec spec; + spec = new WalkerTestSpec( + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"), + 1, + Arrays.asList("88c142737080847e44dc8e8c982cfc74") ); spec.disableShadowBCF(); @@ -197,8 +204,32 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, 1, - Arrays.asList("2f2a342812ba914bcce666e42ef761d7") + Arrays.asList("3cbabcd256070ea113ddc04fef3b6900") ); executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); } + + @Test() + public void testFileWithoutInfoLineInHeader() { + testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeader", UserException.class); + } + + @Test() + public void testFileWithoutInfoLineInHeaderWithOverride() { + testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeaderWithOverride", null); + } + + private void testFileWithoutInfoLineInHeader(final String name, final Class expectedException) { + final String testFile = testDir + "missingHeaderLine.vcf"; + final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + + testFile + " -o %s --no_cmdline_in_header" + + (expectedException == null ? " -allowMissingVCFHeaders" : ""); + WalkerTestSpec spec = + expectedException != null + ? new WalkerTestSpec(cmd, 1, expectedException) + : new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + + executeTest(name, spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 1e9808680..b47895917 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -142,16 +142,16 @@ public class VariantContextTestProvider { if ( ENABLE_SOURCE_VCF_TESTS ) { for ( final File file : testSourceVCFs ) { VCFCodec codec = new VCFCodec(); - Pair> x = readAllVCs( file, codec ); - List fullyDecoded = new ArrayList(x.getSecond().size()); - int i = 0; + Pair> x = readAllVCs( file, codec ); + List fullyDecoded = new ArrayList(); + logger.warn("Reading records from " + file); for ( final VariantContext raw : x.getSecond() ) { fullyDecoded.add(raw.fullyDecode(x.getFirst())); - logger.warn("\t" + i++); } logger.warn("Done reading " + file); - TEST_DATAs.add(new VariantContextTestData(x.getFirst(), x.getSecond())); + + TEST_DATAs.add(new VariantContextTestData(x.getFirst(), fullyDecoded)); } } } @@ -326,6 +326,10 @@ public class VariantContextTestProvider { final Genotype homVar = GenotypeBuilder.create("homVar", Arrays.asList(alt1, alt1)); addGenotypeTests(site, homRef, het); addGenotypeTests(site, homRef, het, homVar); + + // test no GT at all + addGenotypeTests(site); + final List noCall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); // ploidy @@ -511,7 +515,7 @@ public class VariantContextTestProvider { writer.add(vc); writer.close(); - final List actual = readAllVCs(tmpFile, tester.makeCodec()).getSecond(); + final Iterable actual = readAllVCs(tmpFile, tester.makeCodec()).getSecond(); assertEquals(actual, expected); } @@ -524,7 +528,7 @@ public class VariantContextTestProvider { * @return * @throws IOException */ - private final static Pair> readAllVCs( final File source, final FeatureCodec codec ) throws IOException { + private final static Pair> readAllVCs( final File source, final FeatureCodec codec ) throws IOException { // read in the features PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); FeatureCodecHeader header = codec.readHeader(pbs); @@ -533,27 +537,79 @@ public class VariantContextTestProvider { pbs = new PositionalBufferedStream(new FileInputStream(source)); pbs.skip(header.getHeaderEnd()); - final List read = new ArrayList(); - while ( ! pbs.isDone() ) { - final VariantContext vc = codec.decode(pbs); - if ( vc != null ) read.add(vc.fullyDecode((VCFHeader)header.getHeaderValue())); - }; + final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); + return new Pair>(vcfHeader, new VCIterable(pbs, codec, vcfHeader)); + } - return new Pair>((VCFHeader)header.getHeaderValue(), read); + private static class VCIterable implements Iterable, Iterator { + final PositionalBufferedStream pbs; + final FeatureCodec codec; + final VCFHeader header; + + private VCIterable(final PositionalBufferedStream pbs, final FeatureCodec codec, final VCFHeader header) { + this.pbs = pbs; + this.codec = codec; + this.header = header; + } + + @Override + public Iterator iterator() { + return this; + } + + @Override + public boolean hasNext() { + try { + return ! pbs.isDone(); + } catch ( IOException e ) { + throw new RuntimeException(e); + } + } + + @Override + public VariantContext next() { + try { + final VariantContext vc = codec.decode(pbs); + return vc == null ? null : vc.fullyDecode(header); + } catch ( IOException e ) { + throw new RuntimeException(e); + } + } + + @Override + public void remove() { + //To change body of implemented methods use File | Settings | File Templates. + } } public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException { - final Pair> vcfData = readAllVCs(vcfFile, new VCFCodec()); - final Pair> bcfData = readAllVCs(bcfFile, new BCF2Codec()); + final Pair> vcfData = readAllVCs(vcfFile, new VCFCodec()); + final Pair> bcfData = readAllVCs(bcfFile, new BCF2Codec()); assertEquals(bcfData.getFirst(), vcfData.getFirst()); assertEquals(bcfData.getSecond(), vcfData.getSecond()); } - public static void assertEquals(final List actual, final List expected) { - Assert.assertEquals(actual.size(), expected.size()); + public static void assertEquals(final Iterable actual, final Iterable expected) { + final Iterator actualIT = actual.iterator(); + final Iterator expectedIT = expected.iterator(); - for ( int i = 0; i < expected.size(); i++ ) - assertEquals(actual.get(i), expected.get(i)); + while ( expectedIT.hasNext() ) { + final VariantContext expectedVC = expectedIT.next(); + if ( expectedVC == null ) + continue; + + VariantContext actualVC; + do { + Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual"); + actualVC = actualIT.next(); + } while ( actualIT.hasNext() && actualVC == null ); + + if ( actualVC == null ) + Assert.fail("Too few records in actual"); + + assertEquals(actualVC, expectedVC); + } + Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual"); } /** @@ -730,4 +786,14 @@ public class VariantContextTestProvider { Assert.assertEquals(actualLines.get(i), expectedLines.get(i)); } } + + public static void main( String argv[] ) { + final File variants1 = new File(argv[0]); + final File variants2 = new File(argv[1]); + try { + VariantContextTestProvider.assertVCFandBCFFilesAreTheSame(variants1, variants2); + } catch ( IOException e ) { + throw new RuntimeException(e); + } + } } \ No newline at end of file