From 43ad890fcc17f5d956882eebb29716b57ace6c6d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 7 Jun 2012 08:34:25 -0400 Subject: [PATCH] Finalizing BCF2 v2 -- FastGenotypes are the default in the engine. Use --useSlowGenotypes engine argument to return to old representation -- Cleanup of BCF2Codec. Good error handling. Added contracts and docs. -- Added a few more contacts and docs to BCF2Decoder -- Optimized encodePrimitive in BCF2Encoder -- Removed genotype filter field exceptions -- Docs and cleanup of BCF2GenotypeFieldDecoders -- Deleted unused BCF2TestWalker -- Docs and cleanup of BCF2Types -- Faster version of decodeInts in VCFCodec -- BCF2Writer -- Support for writing a sites only file -- Lots of TODOs for future optimizations -- Removed lack of filter field support -- No longer uses the alleleMap from VCFWriter, which was a Allele -> String, now uses Allele -> Integer which is faster and more natural -- Lots of docs and contracts -- Docs for GenotypeBuilder. More filter creation routines (unfiltered, for example) -- More extensive tests in VariantContextTestProfiler, including variable length strings in genotypes and genotype filters. Better genotype comparisons --- .../sting/gatk/GenomeAnalysisEngine.java | 4 +- .../arguments/GATKArgumentCollection.java | 4 +- .../sting/utils/codecs/bcf2/BCF2Codec.java | 204 ++++++++++++++---- .../sting/utils/codecs/bcf2/BCF2Decoder.java | 9 +- .../sting/utils/codecs/bcf2/BCF2Encoder.java | 30 ++- .../bcf2/BCF2GenotypeFieldDecoders.java | 37 ++-- .../utils/codecs/bcf2/BCF2TestWalker.java | 143 ------------ .../sting/utils/codecs/bcf2/BCF2Type.java | 50 ++++- .../sting/utils/codecs/bcf2/BCF2Utils.java | 6 - .../sting/utils/codecs/vcf/VCFCodec.java | 9 +- .../utils/variantcontext/GenotypeBuilder.java | 108 +++++++++- .../variantcontext/GenotypeLikelihoods.java | 1 + .../variantcontext/writer/BCF2Writer.java | 117 ++++++---- .../variantcontext/writer/VCFWriter.java | 2 +- .../VariantContextTestProvider.java | 78 ++++++- 15 files changed, 508 insertions(+), 294 deletions(-) delete mode 100755 public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2TestWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index b4532500b..d77f266bd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -223,8 +223,8 @@ public class GenomeAnalysisEngine { resetRandomGenerator(System.currentTimeMillis()); // TODO -- REMOVE ME WHEN WE STOP BCF testing - if ( this.getArguments().USE_FAST_GENOTYPES ) - GenotypeBuilder.MAKE_FAST_BY_DEFAULT = true; + if ( this.getArguments().USE_SLOW_GENOTYPES ) + GenotypeBuilder.MAKE_FAST_BY_DEFAULT = false; // if the use specified an input BQSR recalibration table then enable on the fly recalibration if (this.getArguments().BQSR_RECAL_FILE != null) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 2e06748f3..bd830a920 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -336,9 +336,9 @@ public class GATKArgumentCollection { public boolean generateShadowBCF = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - @Argument(fullName="useFastGenotypes",shortName = "useFastGenotypes",doc="",required=false) + @Argument(fullName="useSlowGenotypes",shortName = "useSlowGenotypes",doc="",required=false) @Hidden - public boolean USE_FAST_GENOTYPES = false; + public boolean USE_SLOW_GENOTYPES = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed /** diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 5915c9914..122fc1dc9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -24,6 +24,8 @@ package org.broadinstitute.sting.utils.codecs.bcf2; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.Feature; @@ -43,16 +45,45 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; -public class BCF2Codec implements FeatureCodec, ReferenceDependentFeatureCodec { +/** + * Decode BCF2 files + */ +public final class BCF2Codec implements FeatureCodec, ReferenceDependentFeatureCodec { final protected static Logger logger = Logger.getLogger(BCF2Codec.class); private VCFHeader header = null; + + /** + * Maps offsets (encoded in BCF) into contig names (from header) for the CHROM field + */ private final ArrayList contigNames = new ArrayList(); + + /** + * Maps header string names (encoded in VCF) into strings found in the BCF header + * + * Initialized when processing the header + */ private ArrayList dictionary; + + /** + * Our decoder that reads low-level objects from the BCF2 records + */ private final BCF2Decoder decoder = new BCF2Decoder(); - private boolean skipGenotypes = false; + + /** + * Provides some sanity checking on the header + */ private final static int MAX_HEADER_SIZE = 0x08000000; + + /** + * Genotype field decoders that are initialized when the header is read + */ private BCF2GenotypeFieldDecoders gtFieldDecoders = null; + // for error handling + private int recordNo = 0; + private int pos = 0; + + // ---------------------------------------------------------------------- // // Feature codec interface functions @@ -61,28 +92,30 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende @Override public Feature decodeLoc( final PositionalBufferedStream inputStream ) { - return decode(inputStream); - // TODO: a less expensive version of decodeLoc() that doesn't use VariantContext - // TODO: very easy -- just decodeSitesBlock, and then skip to end of end of sites block - // TODO: and then skip genotypes block + recordNo++; + final VariantContextBuilder builder = new VariantContextBuilder(); + + final int sitesBlockSize = decoder.readBlockSize(inputStream); + final int genotypeBlockSize = decoder.readBlockSize(inputStream); // necessary because it's in the stream + decoder.readNextBlock(sitesBlockSize, inputStream); + decodeSiteLoc(builder); + + return builder.fullyDecoded(true).make(); } @Override public VariantContext decode( final PositionalBufferedStream inputStream ) { + recordNo++; final VariantContextBuilder builder = new VariantContextBuilder(); final int sitesBlockSize = decoder.readBlockSize(inputStream); final int genotypeBlockSize = decoder.readBlockSize(inputStream); decoder.readNextBlock(sitesBlockSize, inputStream); - final SitesInfoForDecoding info = decodeSitesBlock(builder); - - if ( isSkippingGenotypes() ) { - decoder.skipNextBlock(genotypeBlockSize, inputStream); - } else { - decoder.readNextBlock(genotypeBlockSize, inputStream); - createLazyGenotypesDecoder(info, builder); - } + decodeSiteLoc(builder); + final SitesInfoForDecoding info = decodeSitesExtendedInfo(builder); + decoder.readNextBlock(genotypeBlockSize, inputStream); + createLazyGenotypesDecoder(info, builder); return builder.fullyDecoded(true).make(); } @@ -96,16 +129,16 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende try { // note that this reads the magic as well, and so does double duty if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) ) - throw new UserException.MalformedBCF2("Input stream does not begin with BCF2 magic"); + error("Input stream does not begin with BCF2 magic"); final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream); if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB - throw new UserException.MalformedBCF2("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE); + error("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE); final byte[] headerBytes = new byte[headerSizeInBytes]; if ( inputStream.read(headerBytes) != headerSizeInBytes ) - throw new UserException.MalformedBCF2("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes); + error("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes); final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes)); final AsciiLineReader headerReader = new AsciiLineReader(bps); @@ -120,8 +153,11 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende if ( ! header.getContigLines().isEmpty() ) { logger.info("Found contig lines in BCF2 file, using those"); contigNames.clear(); - for ( final VCFContigHeaderLine contig : header.getContigLines()) + for ( final VCFContigHeaderLine contig : header.getContigLines()) { + if ( contig.getID() == null || contig.getID().equals("") ) + error("found a contig with an invalid ID " + contig); contigNames.add(contig.getID()); + } } else { logger.info("Didn't find any contig lines in BCF2 file, falling back (dangerously) to GATK reference dictionary"); } @@ -161,7 +197,6 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende // // -------------------------------------------------------------------------------- - @Override public void setGenomeLocParser(final GenomeLocParser genomeLocParser) { // initialize contigNames to standard ones in reference @@ -169,14 +204,6 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende contigNames.add(contig.getSequenceName()); } - public boolean isSkippingGenotypes() { - return skipGenotypes; - } - - public void setSkipGenotypes(final boolean skipGenotypes) { - this.skipGenotypes = skipGenotypes; - } - // -------------------------------------------------------------------------------- // // implicit block @@ -190,16 +217,33 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende // // -------------------------------------------------------------------------------- - private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) { + /** + * Decode the sites level data from this classes decoder + * + * @param builder + * @return + */ + @Requires({"builder != null"}) + private final void decodeSiteLoc(final VariantContextBuilder builder) { final int contigOffset = decoder.decodeInt(BCF2Type.INT32); final String contig = lookupContigName(contigOffset); builder.chr(contig); - final int pos = decoder.decodeInt(BCF2Type.INT32); + this.pos = decoder.decodeInt(BCF2Type.INT32); final int refLength = decoder.decodeInt(BCF2Type.INT32); builder.start((long)pos); builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals + } + /** + * Decode the sites level data from this classes decoder + * + * @param builder + * @return + */ + @Requires({"builder != null", "decoder != null"}) + @Ensures({"result != null", "result.isValid()"}) + private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextBuilder builder) { final Object qual = decoder.decodeSingleValue(BCF2Type.FLOAT); if ( qual != null ) { builder.log10PError(((Double)qual) / -10.0); @@ -217,23 +261,39 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende decodeFilter(builder); decodeInfo(builder, nInfo); - return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles); + final SitesInfoForDecoding info = new SitesInfoForDecoding(nFormatFields, nSamples, alleles); + if ( ! info.isValid() ) + error("Sites info is malformed: " + info); + return info; } protected final static class SitesInfoForDecoding { - final int pos; final int nFormatFields; final int nSamples; final ArrayList alleles; - private SitesInfoForDecoding(final int pos, final int nFormatFields, final int nSamples, final ArrayList alleles) { - this.pos = pos; + private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final ArrayList alleles) { this.nFormatFields = nFormatFields; this.nSamples = nSamples; this.alleles = alleles; } + + public boolean isValid() { + return nFormatFields >= 0 && + nSamples >= 0 && + alleles != null && ! alleles.isEmpty() && alleles.get(0).isReference(); + } + + @Override + public String toString() { + return String.format("nFormatFields = %d, nSamples = %d, alleles = %s", nFormatFields, nSamples, alleles); + } } + /** + * Decode the id field in this BCF2 file and store it in the builder + * @param builder + */ private void decodeID( final VariantContextBuilder builder ) { final String id = (String)decoder.decodeTypedValue(); @@ -243,6 +303,15 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende builder.id(id); } + /** + * Annoying routine that deals with allele clipping from the BCF2 encoding to the standard + * GATK encoding. + * + * @param position + * @param ref + * @param unclippedAlleles + * @return + */ protected static ArrayList clipAllelesIfNecessary(int position, String ref, ArrayList unclippedAlleles) { if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) { ArrayList clippedAlleles = new ArrayList(unclippedAlleles.size()); @@ -252,6 +321,14 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende return unclippedAlleles; } + /** + * Decode the alleles from this BCF2 file and put the results in builder + * @param builder + * @param pos + * @param nAlleles + * @return the alleles + */ + @Requires("nAlleles > 0") private ArrayList decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) { // TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes ArrayList alleles = new ArrayList(nAlleles); @@ -267,15 +344,21 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende alleles.add(Allele.create(allele, false)); } } + assert ref != null; alleles = clipAllelesIfNecessary(pos, ref, alleles); builder.alleles(alleles); + assert ref.length() > 0; builder.referenceBaseForIndel(ref.getBytes()[0]); return alleles; } + /** + * Decode the filter field of this BCF2 file and store the result in the builder + * @param builder + */ private void decodeFilter( final VariantContextBuilder builder ) { final Object value = decoder.decodeTypedValue(); @@ -283,17 +366,28 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende builder.unfiltered(); else { if ( value instanceof Integer ) + // fast path for single integer result builder.filter(getDictionaryString((Integer)value)); else { - for ( int offset : (List)value ) + for ( final int offset : (List)value ) builder.filter(getDictionaryString(offset)); } } } + /** + * Loop over the info field key / value pairs in this BCF2 file and decode them into the builder + * + * @param builder + * @param numInfoFields + */ + @Requires("numInfoFields >= 0") private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) { - final Map infoFieldEntries = new HashMap(numInfoFields); + if ( numInfoFields == 0 ) + // fast path, don't bother doing any work if there are no fields + return; + final Map infoFieldEntries = new HashMap(numInfoFields); for ( int i = 0; i < numInfoFields; i++ ) { final String key = getDictionaryString(); Object value = decoder.decodeTypedValue(); @@ -340,47 +434,63 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende final public int nGenotypeFields; final public byte[] bytes; + @Requires({"nGenotypeField > 0", "bytes != null"}) public LazyData(final int nGenotypeFields, final byte[] bytes) { this.nGenotypeFields = nGenotypeFields; this.bytes = bytes; } } + @Ensures("result != null") private final String getDictionaryString() { return getDictionaryString((Integer) decoder.decodeTypedValue()); } + @Requires("offset >= dictionary.size()") + @Ensures("result != null") protected final String getDictionaryString(final int offset) { - if ( offset >= dictionary.size() ) throw new UserException.MalformedBCF2("BUG: no dictionary field found at offset " + offset); - final String field = dictionary.get(offset); - return field; + return dictionary.get(offset); } + /** + * Translate the config offset as encoded in the BCF file into the actual string + * name of the contig from the dictionary + * + * @param contigOffset + * @return + */ + @Requires({"contigOffset >= 0", "contigOffset < contigNames.size()"}) + @Ensures("result != null") private final String lookupContigName( final int contigOffset ) { - if ( contigOffset < contigNames.size() ) { - return contigNames.get(contigOffset); - } - else { - throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames)); - } + return contigNames.get(contigOffset); } - + @Requires("header != null") + @Ensures({"result != null", "! result.isEmpty()"}) private final ArrayList parseDictionary(final VCFHeader header) { final ArrayList dict = BCF2Utils.makeDictionary(header); // if we got here we never found a dictionary, or there are no elements in the dictionary - if ( dict.size() == 0 ) - throw new UserException.MalformedBCF2("Dictionary header element was absent or empty"); + if ( dict.isEmpty() ) + error("Dictionary header element was absent or empty"); return dict; } + /** + * @return the VCFHeader we found in this BCF2 file + */ protected VCFHeader getHeader() { return header; } + @Requires("field != null") + @Ensures("result != null") protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) { return gtFieldDecoders.getDecoder(field); } + + private final void error(final String message) throws RuntimeException { + throw new UserException.MalformedBCF2(String.format("At record %d with position %d:", recordNo, pos, message)); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index b8d30addd..127390df3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -40,8 +40,8 @@ import java.util.Arrays; public final class BCF2Decoder { final protected static Logger logger = Logger.getLogger(FeatureCodec.class); - byte[] recordBytes; - ByteArrayInputStream recordStream; + byte[] recordBytes = null; + ByteArrayInputStream recordStream = null; public BCF2Decoder() { // nothing to do @@ -69,6 +69,7 @@ public final class BCF2Decoder { * @return */ public void readNextBlock(final int blockSizeInBytes, final InputStream stream) { + if ( blockSizeInBytes < 0 ) throw new UserException.MalformedBCF2("Invalid block size " + blockSizeInBytes); setRecordBytes(readRecordBytes(blockSizeInBytes, stream)); } @@ -115,9 +116,9 @@ public final class BCF2Decoder { * * @param recordBytes */ + @Requires("recordBytes != null") + @Ensures({"this.recordBytes == recordBytes", "recordStream != null"}) public void setRecordBytes(final byte[] recordBytes) { - assert recordBytes != null; - this.recordBytes = recordBytes; this.recordStream = new ByteArrayInputStream(recordBytes); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java index 2f8a99715..cea2d9491 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Encoder.java @@ -32,7 +32,7 @@ import java.io.OutputStream; import java.util.*; /** - * Simple BCF2 encoder + * BCF2 encoder * * @author depristo * @since 5/12 @@ -255,12 +255,30 @@ public final class BCF2Encoder { } public final static void encodePrimitive(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException { - for ( int i = type.getSizeInBytes() - 1; i >= 0; i-- ) { - final int shift = i * 8; - int mask = 0xFF << shift; - int byteValue = (mask & value) >> shift; - encodeStream.write(byteValue); + switch ( type.getSizeInBytes() ) { + case 1: + encodeStream.write(0xFF & value); + break; + case 2: + encodeStream.write((0xFF00 & value) >> 8); + encodeStream.write(0xFF & value); + break; + case 4: + encodeStream.write((0xFF000000 & value) >> 24); + encodeStream.write((0x00FF0000 & value) >> 16); + encodeStream.write((0x0000FF00 & value) >> 8); + encodeStream.write((0x000000FF & value)); + break; + default: + throw new ReviewedStingException("BUG: unexpected type size " + type); } +// general case for reference +// for ( int i = type.getSizeInBytes() - 1; i >= 0; i-- ) { +// final int shift = i * 8; +// int mask = 0xFF << shift; +// int byteValue = (mask & value) >> shift; +// encodeStream.write(byteValue); +// } } private final List stringToBytes(final String v) throws IOException { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java index 8c328d0dc..2f6dfe175 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java @@ -29,17 +29,22 @@ import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; /** - * An efficient + * An efficient scheme for building and obtaining specialized + * genotype field decoders. Used by the BCFCodec to parse + * with little overhead the fields from BCF2 encoded genotype + * records * - * @author Your Name - * @since Date created + * @author Mark DePristo + * @since 6/12 */ public class BCF2GenotypeFieldDecoders { final protected static Logger logger = Logger.getLogger(BCF2GenotypeFieldDecoders.class); @@ -54,7 +59,8 @@ public class BCF2GenotypeFieldDecoders { // TODO -- fill in appropriate decoders for each FORMAT field in the header genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder()); - genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FLDecoder()); + // currently the generic decoder handles FILTER values properly, in so far as we don't tolerate multiple filter field values per genotype + genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new GenericDecoder()); genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder()); genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder()); genotypeFieldDecoder.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, new PLDecoder()); @@ -246,13 +252,6 @@ public class BCF2GenotypeFieldDecoders { } } - private class FLDecoder implements Decoder { - @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { - throw new ReviewedStingException("Genotype filter not implemented in BCF2 yet"); - } - } - private class GenericDecoder implements Decoder { @Override public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List gbs) { @@ -271,16 +270,4 @@ public class BCF2GenotypeFieldDecoders { } } } - - private static final int[] decodeIntArray(final Object value) { - // todo -- decode directly into int[] - final List pls = (List)value; - if ( pls != null ) { // we have a PL field - final int[] x = new int[pls.size()]; - for ( int j = 0; j < x.length; j++ ) - x[j] = pls.get(j); - return x; - } else - return null; - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2TestWalker.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2TestWalker.java deleted file mode 100755 index 001aeee68..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2TestWalker.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.bcf2; - -import org.broad.tribble.FeatureCodecHeader; -import org.broad.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; -import org.broadinstitute.sting.utils.variantcontext.writer.Options; -import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory; - -import java.io.*; -import java.util.*; - -/** - * Testing BCF2 - * - * @author Mark DePristo - * @since 2012 - */ -public class BCF2TestWalker extends RodWalker { - /** - * Variants from this VCF file are used by this tool as input. - * The file must at least contain the standard VCF header lines, but - * can be empty (i.e., no variants are contained in the file). - */ - @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) - public RodBinding variants; - - @Argument(doc="keep variants", required=false) - public boolean keepVariants = false; - - @Argument(doc="quiet", required=false) - public boolean quiet = false; - - @Argument(doc="dontIndexOnTheFly", required=false) - public boolean dontIndexOnTheFly = false; - - @Output(doc="File to which results should be written",required=true) - protected File bcfFile; - - private final List vcs = new ArrayList(); - protected VariantContextWriter writer; - - @Override - public void initialize() { - final Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Collections.singletonList(variants)); - final VCFHeader header = VCFUtils.withUpdatedContigs(vcfRods.values().iterator().next(), getToolkit()); - try { - EnumSet options = EnumSet.of(Options.FORCE_BCF); - if ( !dontIndexOnTheFly ) options.add(Options.INDEX_ON_THE_FLY); - writer = VariantContextWriterFactory.create(bcfFile, new FileOutputStream(bcfFile), getToolkit().getMasterSequenceDictionary(), options); - writer.writeHeader(header); - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotCreateOutputFile(bcfFile, e); - } - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) // RodWalkers can make funky map calls - return 0; - - for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) { - writer.add(vc); - if ( keepVariants ) vcs.add(vc); - } - - return 1; - } - - // - // default reduce -- doesn't do anything at all - // - public Integer reduceInit() { return 0; } - public Integer reduce(Integer counter, Integer sum) { return counter + sum; } - - public void onTraversalDone(Integer sum) { - try { - writer.close(); - logger.info("Closed writer"); - - // read in the BCF records - BCF2Codec codec = new BCF2Codec(); - PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(bcfFile)); - FeatureCodecHeader header = codec.readHeader(pbs); - pbs.close(); - - pbs = new PositionalBufferedStream(new FileInputStream(bcfFile)); - pbs.skip(header.getHeaderEnd()); - Iterator it = vcs.iterator(); - while ( ! pbs.isDone() ) { - if ( keepVariants ) { - VariantContext expected = it.next(); - if ( ! quiet ) - System.out.printf("vcf = %s %d %s%n", expected.getChr(), expected.getStart(), expected); - } - VariantContext bcfRaw = codec.decode(pbs); - VariantContext bcf = new VariantContextBuilder(bcfRaw).source("variant").make(); - if ( ! quiet ) { - System.out.printf("bcf = %s %d %s%n", bcf.getChr(), bcf.getStart(), bcf.toString()); - System.out.printf("--------------------------------------------------%n"); - } - } - - } catch ( IOException e ) { - throw new UserException.CouldNotCreateOutputFile(bcfFile, "bad user!"); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java index 9e21ddecd..292850880 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Type.java @@ -24,20 +24,22 @@ package org.broadinstitute.sting.utils.codecs.bcf2; +import com.google.java.contract.Requires; + import java.util.EnumSet; /** - * BCF2 types and information + * BCF2 types and associated information * * @author depristo * @since 05/12 */ public enum BCF2Type { - INT8(1, 1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range - INT16(2, 2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767), - INT32(3, 4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647), - FLOAT(5, 4, BCF2Utils.FLOAT_MISSING_VALUE), - CHAR(7); + INT8 (1, 1, 0xFFFFFF80, -127, 127), // todo -- confirm range + INT16(2, 2, 0xFFFF8000, -32767, 32767), + INT32(3, 4, 0x80000000, -2147483647, 2147483647), + FLOAT(5, 4, 0x7F800001), + CHAR (7); private final int id; private final Object missingJavaValue; @@ -62,13 +64,49 @@ public enum BCF2Type { this.maxValue = maxValue; } + /** + * How many bytes are used to represent this type on disk? + * @return + */ public int getSizeInBytes() { return sizeInBytes; } + + /** + * The ID according to the BCF2 specification + * @return + */ public int getID() { return id; } + + /** + * Can we encode value v in this type, according to its declared range. + * + * Only makes sense for integer values + * + * @param v + * @return + */ + @Requires("INTEGERS.contains(this)") public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; } + + /** + * Return the java object (aka null) that is used to represent a missing value for this + * type in Java + * + * @return + */ public Object getMissingJavaValue() { return missingJavaValue; } + + /** + * The bytes (encoded as an int) that are used to represent a missing value + * for this type in BCF2 + * + * @return + */ public int getMissingBytes() { return missingBytes; } + /** + * An enum set of the types that might represent Integer values + */ public final static EnumSet INTEGERS = EnumSet.of(INT8, INT16, INT32); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index ac669667f..fceb60a57 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -53,12 +53,6 @@ public final class BCF2Utils { public static final int OVERFLOW_ELEMENT_MARKER = 15; public static final int MAX_INLINE_ELEMENTS = 14; - // Note that these values are prefixed by FFFFFF for convenience - public static final int INT8_MISSING_VALUE = 0xFFFFFF80; - public static final int INT16_MISSING_VALUE = 0xFFFF8000; - public static final int INT32_MISSING_VALUE = 0x80000000; - public static final int FLOAT_MISSING_VALUE = 0x7F800001; - public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32}; public final static BCF2Type[] ID_TO_ENUM; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 79feb13ed..0f07937da 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -249,11 +249,12 @@ public class VCFCodec extends AbstractVCFCodec { return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset()); } + private final static String[] INT_DECODE_ARRAY = new String[10000]; private final static int[] decodeInts(final String string) { - final String[] splits = string.split(","); - final int[] values = new int[splits.length]; - for ( int i = 0; i < splits.length; i++ ) - values[i] = Integer.valueOf(splits[i]); + final int nValues = ParsingUtils.split(string, INT_DECODE_ARRAY, ','); + final int[] values = new int[nValues]; + for ( int i = 0; i < nValues; i++ ) + values[i] = Integer.valueOf(INT_DECODE_ARRAY[i]); return values; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java index 90f0381c9..400f85761 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java @@ -34,10 +34,23 @@ import java.util.*; /** * A builder class for genotypes * + * Provides convenience setter methods for all of the Genotype field + * values. Setter methods can be used in any order, allowing you to + * pass through states that wouldn't be allowed in the highly regulated + * immutable Genotype class. + * + * All fields default to meaningful MISSING values. + * + * Call make() to actually create the corresponding Genotype object from + * this builder. Can be called multiple times to create independent copies, + * or with intervening sets to conveniently make similar Genotypes with + * slight modifications. + * * @author Mark DePristo + * @since 06/12 */ public final class GenotypeBuilder { - public static boolean MAKE_FAST_BY_DEFAULT = false; + public static boolean MAKE_FAST_BY_DEFAULT = true; private String sampleName = null; private List alleles = null; @@ -48,6 +61,7 @@ public final class GenotypeBuilder { private int[] AD = null; private int[] PL = null; private Map extendedAttributes = null; + private int initialAttributeMapSize = 5; private boolean useFast = MAKE_FAST_BY_DEFAULT; @@ -191,6 +205,11 @@ public final class GenotypeBuilder { return this; } + /** + * Set this genotype's name + * @param sampleName + * @return + */ @Requires({"sampleName != null"}) @Ensures({"this.sampleName != null"}) public GenotypeBuilder name(final String sampleName) { @@ -198,6 +217,11 @@ public final class GenotypeBuilder { return this; } + /** + * Set this genotype's alleles + * @param alleles + * @return + */ @Ensures({"this.alleles != null"}) public GenotypeBuilder alleles(final List alleles) { if ( alleles == null ) @@ -207,6 +231,11 @@ public final class GenotypeBuilder { return this; } + /** + * Is this genotype phased? + * @param phased + * @return + */ public GenotypeBuilder phased(final boolean phased) { isPhased = phased; return this; @@ -235,11 +264,34 @@ public final class GenotypeBuilder { return GQ((int)Math.round(pLog10Error * -10)); } + /** + * This genotype has no GQ value + * @return + */ public GenotypeBuilder noGQ() { GQ = -1; return this; } + + /** + * This genotype has no AD value + * @return + */ public GenotypeBuilder noAD() { AD = null; return this; } + + /** + * This genotype has no DP value + * @return + */ public GenotypeBuilder noDP() { DP = -1; return this; } + + /** + * This genotype has no PL value + * @return + */ public GenotypeBuilder noPL() { PL = null; return this; } + /** + * This genotype has this DP value + * @return + */ @Requires({"DP >= -1"}) @Ensures({"this.DP == DP"}) public GenotypeBuilder DP(final int DP) { @@ -247,6 +299,10 @@ public final class GenotypeBuilder { return this; } + /** + * This genotype has this AD value + * @return + */ @Requires({"AD == null || AD.length > 0"}) @Ensures({"this.AD == AD"}) public GenotypeBuilder AD(final int[] AD) { @@ -254,6 +310,10 @@ public final class GenotypeBuilder { return this; } + /** + * This genotype has this PL value, as int[]. FAST + * @return + */ @Requires("PL == null || PL.length > 0") @Ensures({"this.PL == PL"}) public GenotypeBuilder PL(final int[] PL) { @@ -261,6 +321,10 @@ public final class GenotypeBuilder { return this; } + /** + * This genotype has this PL value, converted from double[]. SLOW + * @return + */ @Requires("PL == null || PL.length > 0") @Ensures({"this.PL == PL"}) public GenotypeBuilder PL(final double[] GLs) { @@ -268,6 +332,12 @@ public final class GenotypeBuilder { return this; } + /** + * This genotype has these attributes. + * + * Cannot contain inline attributes (DP, AD, GQ, PL) + * @return + */ @Requires("attributes != null") @Ensures("attributes.isEmpty() || extendedAttributes != null") public GenotypeBuilder attributes(final Map attributes) { @@ -276,11 +346,17 @@ public final class GenotypeBuilder { return this; } + /** + * This genotype has this attribute key / value pair. + * + * Cannot contain inline attributes (DP, AD, GQ, PL) + * @return + */ @Requires({"key != null"}) @Ensures({"extendedAttributes != null", "extendedAttributes.containsKey(key)"}) public GenotypeBuilder attribute(final String key, final Object value) { if ( extendedAttributes == null ) - extendedAttributes = new HashMap(5); + extendedAttributes = new HashMap(initialAttributeMapSize); extendedAttributes.put(key, value); return this; } @@ -299,8 +375,34 @@ public final class GenotypeBuilder { return this; } + /** + * varargs version of #filters + * @param filters + * @return + */ + @Requires("filters != null") + public GenotypeBuilder filters(final String ... filters) { + return filters(Arrays.asList(filters)); + } + + /** + * This genotype is unfiltered + * + * @return + */ + @Requires("filters != null") + public GenotypeBuilder unfiltered() { + if ( extendedAttributes != null ) + extendedAttributes.remove(VCFConstants.GENOTYPE_FILTER_KEY); + return this; + } + + /** + * Tell's this builder that we have at most these number of attributes + * @return + */ public GenotypeBuilder maxAttributes(final int i) { - // TODO + initialAttributeMapSize = i; return this; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 961d4d495..fa41a3c99 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -48,6 +48,7 @@ public class GenotypeLikelihoods { return new GenotypeLikelihoods(PLs); } + @Deprecated public final static GenotypeLikelihoods fromGLField(String GLs) { return new GenotypeLikelihoods(parseDeprecatedGLString(GLs)); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index 64f901451..1783cd27f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -24,6 +24,8 @@ package org.broadinstitute.sting.utils.variantcontext.writer; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec; @@ -39,7 +41,6 @@ import java.io.*; import java.util.*; class BCF2Writer extends IndexingVariantContextWriter { - private final static boolean FULLY_DECODE = true; final protected static Logger logger = Logger.getLogger(BCF2Writer.class); private final OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support @@ -63,12 +64,6 @@ class BCF2Writer extends IndexingVariantContextWriter { // // -------------------------------------------------------------------------------- - private final void createContigDictionary(final Collection contigLines) { - int offset = 0; - for ( VCFContigHeaderLine contig : contigLines ) - contigDictionary.put(contig.getID(), offset++); - } - @Override public void writeHeader(final VCFHeader header) { // create the config offsets map @@ -103,8 +98,11 @@ class BCF2Writer extends IndexingVariantContextWriter { } @Override - public void add( final VariantContext initialVC ) { - final VariantContext vc = FULLY_DECODE ? initialVC.fullyDecode(header) : initialVC; + public void add( VariantContext vc ) { + if ( doNotWriteGenotypes ) + vc = new VariantContextBuilder(vc).noGenotypes().make(); + vc = vc.fullyDecode(header); + super.add(vc); // allow on the fly indexing try { @@ -253,9 +251,13 @@ class BCF2Writer extends IndexingVariantContextWriter { } } + // TODO TODO TODO TODO TODO + // TODO // TODO -- we really need explicit converters as first class objects // TODO -- need to generalize so we can enable vectors of compressed genotype ints // TODO -- no sense in allocating these over and over + // TODO + // TODO TODO TODO TODO TODO private final VCFToBCFEncoding prepFieldValueForEncoding(final String field, final Object value) { final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, field); final boolean isList = value instanceof List; @@ -269,8 +271,8 @@ class BCF2Writer extends IndexingVariantContextWriter { case Flag: return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1)); case String: - final List s = isList ? (List)value : Collections.singletonList((String) value); - return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, s); + final String s = isList ? BCF2Utils.collapseStringList((List)value) : (String)value; + return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, Collections.singletonList(s)); case Integer: // note integer calculation is a bit complex because of the need to determine sizes List l; BCF2Type intType; @@ -297,19 +299,6 @@ class BCF2Writer extends IndexingVariantContextWriter { } } - private final void addGenotypeFilters(final VariantContext vc) throws IOException { - logger.warn("Skipping genotype filter field"); -// // TODO -- FIXME -- string is wrong here -- need to compute string size... -// startGenotypeField(VCFConstants.GENOTYPE_FILTER_KEY, 1, BCFType.CHAR); -// for ( final Genotype g : vc.getGenotypes() ) { -// if ( g.filtersWereApplied() && g.isFiltered() ) { -// encoder.encodeString(ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters()))); -// } else { -// encoder.encodeRawMissingValues(1, BCFType.CHAR); // todo fixme -// } -// } - } - // -------------------------------------------------------------------------------- // // Genotype field encoding @@ -319,10 +308,8 @@ class BCF2Writer extends IndexingVariantContextWriter { private byte[] buildSamplesData(final VariantContext vc) throws IOException { BCF2Codec.LazyData lazyData = getLazyData(vc); if ( lazyData != null ) { - logger.info("Lazy decoded data detected, writing bytes directly to stream. N = " + lazyData.bytes.length); return lazyData.bytes; } else { - final GenotypesContext gc = vc.getGenotypes(); // we have to do work to convert the VC into a BCF2 byte stream List genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header); for ( final String field : genotypeFields ) { @@ -330,8 +317,6 @@ class BCF2Writer extends IndexingVariantContextWriter { addGenotypes(vc); } else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) { addIntGenotypeField(field, intGenotypeFieldAccessors.getAccessor(field), vc); - } else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { - addGenotypeFilters(vc); } else { addGenericGenotypeField(vc, field); } @@ -388,6 +373,9 @@ class BCF2Writer extends IndexingVariantContextWriter { if ( numInFormatField == 1 ) { // we encode the actual allele, encodeRawValue handles the missing case where fieldValue == null encoder.encodeRawValue(fieldValue, encoding.BCF2Type); + } else if ( encoding.vcfType == VCFHeaderLineType.String ) { + // we have multiple strings, so we need to collapse them + throw new ReviewedStingException("LIMITATION: Cannot encode arrays of string values in genotypes"); } else { // multiple values, need to handle general case final List asList = toList(fieldValue); @@ -402,6 +390,12 @@ class BCF2Writer extends IndexingVariantContextWriter { } } + // TODO TODO TODO TODO TODO + // TODO + // TODO THIS ROUTINE NEEDS TO BE OPTIMIZED. IT ACCOUNTS FOR A SIGNIFICANT AMOUNT OF THE + // TODO RUNTIME FOR WRITING OUT BCF FILES WITH MANY GENOTYPES + // TODO + // TODO TODO TODO TODO TODO private final void addIntGenotypeField(final String field, final IntGenotypeFieldAccessors.Accessor ige, final VariantContext vc) throws IOException { @@ -429,13 +423,19 @@ class BCF2Writer extends IndexingVariantContextWriter { encoder.encodePrimitive(pl == -1 ? type.getMissingBytes() : pl, type); } + // TODO TODO TODO TODO TODO + // TODO + // TODO we should really have a fast path for encoding diploid genotypes where + // TODO we don't pay the overhead of creating the allele maps + // TODO + // TODO TODO TODO TODO TODO private final void addGenotypes(final VariantContext vc) throws IOException { if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES ) throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " + "with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have " + vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart()); - final Map alleleMap = VCFWriter.buildAlleleMap(vc); + final Map alleleMap = buildAlleleMap(vc); final int maxPloidy = vc.getMaxPloidy(); startGenotypeField(VCFConstants.GENOTYPE_KEY, maxPloidy, BCF2Type.INT8); for ( final String name : header.getGenotypeSamples() ) { @@ -446,7 +446,7 @@ class BCF2Writer extends IndexingVariantContextWriter { if ( i < samplePloidy ) { // we encode the actual allele final Allele a = alleles.get(i); - final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a)); + final int offset = alleleMap.get(a); final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00); encoder.encodePrimitive(encoded, BCF2Type.INT8); } else { @@ -457,6 +457,18 @@ class BCF2Writer extends IndexingVariantContextWriter { } } + private final static Map buildAlleleMap(final VariantContext vc) { + final Map alleleMap = new HashMap(vc.getAlleles().size()+1); + alleleMap.put(Allele.NO_CALL, -1); // convenience for lookup + + final List alleles = vc.getAlleles(); + for ( int i = 0; i < alleles.size(); i++ ) { + alleleMap.put(alleles.get(i), i); + } + + return alleleMap; + } + // -------------------------------------------------------------------------------- // // Low-level block encoding @@ -470,24 +482,26 @@ class BCF2Writer extends IndexingVariantContextWriter { * * @throws IOException */ + @Requires({"infoBlock.length > 0", "genotypesBlock.length >= 0"}) private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException { - assert infoBlock.length > 0; - assert genotypesBlock.length >= 0; - BCF2Encoder.encodePrimitive(infoBlock.length, BCF2Type.INT32, outputStream); BCF2Encoder.encodePrimitive(genotypesBlock.length, BCF2Type.INT32, outputStream); outputStream.write(infoBlock); outputStream.write(genotypesBlock); } - // TODO -- obvious optimization case + @Requires("string != null") + @Ensures("BCF2Type.INTEGERS.contains(result)") private final BCF2Type encodeStringByRef(final String string) throws IOException { - assert string != null; - - return encodeStringsByRef(Collections.singletonList(string)); + final Integer offset = stringDictionaryMap.get(string); + if ( offset == null ) throw new ReviewedStingException("Format error: could not find string " + string + " in header as required by BCF"); + final BCF2Type type = encoder.determineIntegerType(offset); + encoder.encodeTyped(offset, type); + return type; } - // TODO -- in size == 1 case branch to singleoton fast-path + @Requires("! strings.isEmpty()") + @Ensures("BCF2Type.INTEGERS.contains(result)") private final BCF2Type encodeStringsByRef(final Collection strings) throws IOException { assert ! strings.isEmpty(); @@ -517,17 +531,38 @@ class BCF2Writer extends IndexingVariantContextWriter { return maxType; } + @Requires({"key != null", "! key.equals(\"\")", "size >= 0"}) private final void startGenotypeField(final String key, final int size, final BCF2Type valueType) throws IOException { - assert key != null && ! key.equals(""); - assert size >= 0; - encodeStringByRef(key); encoder.encodeType(size, valueType); } + /** + * Helper function that takes an object and returns a list representation + * of it: + * + * o == null => [] + * o is a list => o + * else => [o] + * + * @param o + * @return + */ private final static List toList(final Object o) { if ( o == null ) return Collections.emptyList(); else if ( o instanceof List ) return (List)o; else return Collections.singletonList(o); } + + /** + * Create the contigDictionary from the contigLines extracted from the VCF header + * + * @param contigLines + */ + @Requires("contigDictionary.isEmpty()") + private final void createContigDictionary(final Collection contigLines) { + int offset = 0; + for ( VCFContigHeaderLine contig : contigLines ) + contigDictionary.put(contig.getID(), offset++); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index 380bcf4ae..ee65497f1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -251,7 +251,7 @@ class VCFWriter extends IndexingVariantContextWriter { } } - public static Map buildAlleleMap(final VariantContext vc) { + private static Map buildAlleleMap(final VariantContext vc) { final Map alleleMap = new HashMap(vc.getAlleles().size()+1); alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 9b05bd30b..da8625411 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -161,9 +161,13 @@ public class VariantContextTestProvider { metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x")); metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x")); - metaData.add(new VCFInfoHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype")); - metaData.add(new VCFInfoHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality")); - metaData.add(new VCFInfoHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); + metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype")); + metaData.add(new VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality")); + metaData.add(new VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); + metaData.add(new VCFFormatHeaderLine("GS", 2, VCFHeaderLineType.String, "A doubleton list of strings")); + metaData.add(new VCFFormatHeaderLine("GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "A variable list of strings")); + metaData.add(new VCFFormatHeaderLine("FT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype filters")); + // prep the header metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0)); @@ -386,7 +390,53 @@ public class VariantContextTestProvider { attr("g1", ref, "FLOAT3", 1.0, 2.0, 3.0), attr("g2", ref, "FLOAT3")); - // test test Integer, Float, Flag, String atomic, vector, and missing types of different lengths per sample + // + // + // TESTING MULTIPLE SIZED LISTS IN THE GENOTYPE FIELD + // + // + addGenotypeTests(site, + attr("g1", ref, "GS", Arrays.asList("S1", "S2")), + attr("g2", ref, "GS", Arrays.asList("S3", "S4"))); + + addGenotypeTests(site, // g1 is missing the string, and g2 is missing FLOAT1 + attr("g1", ref, "FLOAT1", 1.0), + attr("g2", ref, "GS", Arrays.asList("S3", "S4"))); + + // variable sized lists + addGenotypeTests(site, + attr("g1", ref, "GV", Arrays.asList("S1")), + attr("g2", ref, "GV", Arrays.asList("S3", "S4"))); + + addGenotypeTests(site, + attr("g1", ref, "GV", Arrays.asList("S1", "S2")), + attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5"))); + + addGenotypeTests(site, // missing value in varlist of string + attr("g1", ref, "FLOAT1", 1.0), + attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5"))); + + + // + // + // TESTING GENOTYPE FILTERS + // + // + addGenotypeTests(site, + new GenotypeBuilder("g1", Arrays.asList(ref, ref)).filters("X").make(), + new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make()); + addGenotypeTests(site, + new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(), + new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make()); + addGenotypeTests(site, + new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(), + new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X", "Y").make()); + addGenotypeTests(site, + new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(), + new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make(), + new GenotypeBuilder("g3", Arrays.asList(ref, ref)).filters("X", "Y").make()); + + // TODO -- test test Integer, Float, Flag, String atomic, vector, and missing types of different lengths per sample } private static Genotype attr(final String name, final Allele ref, final String key, final Object ... value) { @@ -493,7 +543,27 @@ public class VariantContextTestProvider { Assert.assertEquals(actual.getSampleName(), expected.getSampleName()); Assert.assertEquals(actual.getAlleles(), expected.getAlleles()); Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString()); + Assert.assertEquals(actual.getType(), expected.getType()); + + // filters are the same Assert.assertEquals(actual.getFilters(), expected.getFilters()); + Assert.assertEquals(actual.isFiltered(), expected.isFiltered()); + Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied()); + + // inline attributes + Assert.assertEquals(actual.getDP(), expected.getDP()); + Assert.assertEquals(actual.getAD(), expected.getAD()); + Assert.assertEquals(actual.getGQ(), expected.getGQ()); + Assert.assertEquals(actual.hasPL(), expected.hasPL()); + Assert.assertEquals(actual.hasAD(), expected.hasAD()); + Assert.assertEquals(actual.hasGQ(), expected.hasGQ()); + Assert.assertEquals(actual.hasDP(), expected.hasDP()); + + Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods()); + Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString()); + Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods()); + Assert.assertEquals(actual.getPL(), expected.getPL()); + Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual()); assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes()); Assert.assertEquals(actual.isPhased(), expected.isPhased());