Finalizing BCF2 v2

-- FastGenotypes are the default in the engine.  Use --useSlowGenotypes engine argument to return to old representation
-- Cleanup of BCF2Codec.  Good error handling.  Added contracts and docs.
-- Added a few more contracts and docs to BCF2Decoder
-- Optimized encodePrimitive in BCF2Encoder
-- Removed genotype filter field exceptions
-- Docs and cleanup of BCF2GenotypeFieldDecoders
-- Deleted unused BCF2TestWalker
-- Docs and cleanup of BCF2Types
-- Faster version of decodeInts in VCFCodec
-- BCF2Writer
    -- Support for writing a sites only file
    -- Lots of TODOs for future optimizations
    -- Removed lack of filter field support
    -- No longer uses the alleleMap from VCFWriter, which was an Allele -> String, now uses Allele -> Integer which is faster and more natural
    -- Lots of docs and contracts
-- Docs for GenotypeBuilder.  More filter creation routines (unfiltered, for example)
-- More extensive tests in VariantContextTestProfiler, including variable length strings in genotypes and genotype filters.  Better genotype comparisons
This commit is contained in:
Mark DePristo 2012-06-07 08:34:25 -04:00
parent 938d3ada32
commit 43ad890fcc
15 changed files with 508 additions and 294 deletions

View File

@ -223,8 +223,8 @@ public class GenomeAnalysisEngine {
resetRandomGenerator(System.currentTimeMillis());
// TODO -- REMOVE ME WHEN WE STOP BCF testing
if ( this.getArguments().USE_FAST_GENOTYPES )
GenotypeBuilder.MAKE_FAST_BY_DEFAULT = true;
if ( this.getArguments().USE_SLOW_GENOTYPES )
GenotypeBuilder.MAKE_FAST_BY_DEFAULT = false;
// if the user specified an input BQSR recalibration table then enable on the fly recalibration
if (this.getArguments().BQSR_RECAL_FILE != null)

View File

@ -336,9 +336,9 @@ public class GATKArgumentCollection {
public boolean generateShadowBCF = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
@Argument(fullName="useFastGenotypes",shortName = "useFastGenotypes",doc="",required=false)
@Argument(fullName="useSlowGenotypes",shortName = "useSlowGenotypes",doc="",required=false)
@Hidden
public boolean USE_FAST_GENOTYPES = false;
public boolean USE_SLOW_GENOTYPES = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
/**

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
@ -43,16 +45,45 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
/**
* Decode BCF2 files
*/
public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
private VCFHeader header = null;
/**
* Maps offsets (encoded in BCF) into contig names (from header) for the CHROM field
*/
private final ArrayList<String> contigNames = new ArrayList<String>();
/**
* Maps header string names (encoded in VCF) into strings found in the BCF header
*
* Initialized when processing the header
*/
private ArrayList<String> dictionary;
/**
* Our decoder that reads low-level objects from the BCF2 records
*/
private final BCF2Decoder decoder = new BCF2Decoder();
private boolean skipGenotypes = false;
/**
* Provides some sanity checking on the header
*/
private final static int MAX_HEADER_SIZE = 0x08000000;
/**
* Genotype field decoders that are initialized when the header is read
*/
private BCF2GenotypeFieldDecoders gtFieldDecoders = null;
// for error handling
private int recordNo = 0;
private int pos = 0;
// ----------------------------------------------------------------------
//
// Feature codec interface functions
@ -61,28 +92,30 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
@Override
public Feature decodeLoc( final PositionalBufferedStream inputStream ) {
return decode(inputStream);
// TODO: a less expensive version of decodeLoc() that doesn't use VariantContext
// TODO: very easy -- just decodeSitesBlock, and then skip to end of sites block
// TODO: and then skip genotypes block
recordNo++;
final VariantContextBuilder builder = new VariantContextBuilder();
final int sitesBlockSize = decoder.readBlockSize(inputStream);
final int genotypeBlockSize = decoder.readBlockSize(inputStream); // necessary because it's in the stream
decoder.readNextBlock(sitesBlockSize, inputStream);
decodeSiteLoc(builder);
return builder.fullyDecoded(true).make();
}
@Override
public VariantContext decode( final PositionalBufferedStream inputStream ) {
recordNo++;
final VariantContextBuilder builder = new VariantContextBuilder();
final int sitesBlockSize = decoder.readBlockSize(inputStream);
final int genotypeBlockSize = decoder.readBlockSize(inputStream);
decoder.readNextBlock(sitesBlockSize, inputStream);
final SitesInfoForDecoding info = decodeSitesBlock(builder);
if ( isSkippingGenotypes() ) {
decoder.skipNextBlock(genotypeBlockSize, inputStream);
} else {
decoder.readNextBlock(genotypeBlockSize, inputStream);
createLazyGenotypesDecoder(info, builder);
}
decodeSiteLoc(builder);
final SitesInfoForDecoding info = decodeSitesExtendedInfo(builder);
decoder.readNextBlock(genotypeBlockSize, inputStream);
createLazyGenotypesDecoder(info, builder);
return builder.fullyDecoded(true).make();
}
@ -96,16 +129,16 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
try {
// note that this reads the magic as well, and so does double duty
if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) )
throw new UserException.MalformedBCF2("Input stream does not begin with BCF2 magic");
error("Input stream does not begin with BCF2 magic");
final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream);
if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB
throw new UserException.MalformedBCF2("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
error("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
final byte[] headerBytes = new byte[headerSizeInBytes];
if ( inputStream.read(headerBytes) != headerSizeInBytes )
throw new UserException.MalformedBCF2("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
error("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes));
final AsciiLineReader headerReader = new AsciiLineReader(bps);
@ -120,8 +153,11 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
if ( ! header.getContigLines().isEmpty() ) {
logger.info("Found contig lines in BCF2 file, using those");
contigNames.clear();
for ( final VCFContigHeaderLine contig : header.getContigLines())
for ( final VCFContigHeaderLine contig : header.getContigLines()) {
if ( contig.getID() == null || contig.getID().equals("") )
error("found a contig with an invalid ID " + contig);
contigNames.add(contig.getID());
}
} else {
logger.info("Didn't find any contig lines in BCF2 file, falling back (dangerously) to GATK reference dictionary");
}
@ -161,7 +197,6 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
//
// --------------------------------------------------------------------------------
@Override
public void setGenomeLocParser(final GenomeLocParser genomeLocParser) {
// initialize contigNames to standard ones in reference
@ -169,14 +204,6 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
contigNames.add(contig.getSequenceName());
}
public boolean isSkippingGenotypes() {
return skipGenotypes;
}
public void setSkipGenotypes(final boolean skipGenotypes) {
this.skipGenotypes = skipGenotypes;
}
// --------------------------------------------------------------------------------
//
// implicit block
@ -190,16 +217,33 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
//
// --------------------------------------------------------------------------------
private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) {
/**
* Decode the sites level data from this classes decoder
*
* @param builder
* @return
*/
@Requires({"builder != null"})
private final void decodeSiteLoc(final VariantContextBuilder builder) {
final int contigOffset = decoder.decodeInt(BCF2Type.INT32);
final String contig = lookupContigName(contigOffset);
builder.chr(contig);
final int pos = decoder.decodeInt(BCF2Type.INT32);
this.pos = decoder.decodeInt(BCF2Type.INT32);
final int refLength = decoder.decodeInt(BCF2Type.INT32);
builder.start((long)pos);
builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals
}
/**
* Decode the sites level data from this classes decoder
*
* @param builder
* @return
*/
@Requires({"builder != null", "decoder != null"})
@Ensures({"result != null", "result.isValid()"})
private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextBuilder builder) {
final Object qual = decoder.decodeSingleValue(BCF2Type.FLOAT);
if ( qual != null ) {
builder.log10PError(((Double)qual) / -10.0);
@ -217,23 +261,39 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
decodeFilter(builder);
decodeInfo(builder, nInfo);
return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles);
final SitesInfoForDecoding info = new SitesInfoForDecoding(nFormatFields, nSamples, alleles);
if ( ! info.isValid() )
error("Sites info is malformed: " + info);
return info;
}
protected final static class SitesInfoForDecoding {
final int pos;
final int nFormatFields;
final int nSamples;
final ArrayList<Allele> alleles;
private SitesInfoForDecoding(final int pos, final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) {
this.pos = pos;
private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) {
this.nFormatFields = nFormatFields;
this.nSamples = nSamples;
this.alleles = alleles;
}
public boolean isValid() {
return nFormatFields >= 0 &&
nSamples >= 0 &&
alleles != null && ! alleles.isEmpty() && alleles.get(0).isReference();
}
@Override
public String toString() {
return String.format("nFormatFields = %d, nSamples = %d, alleles = %s", nFormatFields, nSamples, alleles);
}
}
/**
* Decode the id field in this BCF2 file and store it in the builder
* @param builder
*/
private void decodeID( final VariantContextBuilder builder ) {
final String id = (String)decoder.decodeTypedValue();
@ -243,6 +303,15 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.id(id);
}
/**
* Annoying routine that deals with allele clipping from the BCF2 encoding to the standard
* GATK encoding.
*
* @param position
* @param ref
* @param unclippedAlleles
* @return
*/
protected static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) {
if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) {
ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size());
@ -252,6 +321,14 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
return unclippedAlleles;
}
/**
* Decode the alleles from this BCF2 file and put the results in builder
* @param builder
* @param pos
* @param nAlleles
* @return the alleles
*/
@Requires("nAlleles > 0")
private ArrayList<Allele> decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) {
// TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes
ArrayList<Allele> alleles = new ArrayList<Allele>(nAlleles);
@ -267,15 +344,21 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
alleles.add(Allele.create(allele, false));
}
}
assert ref != null;
alleles = clipAllelesIfNecessary(pos, ref, alleles);
builder.alleles(alleles);
assert ref.length() > 0;
builder.referenceBaseForIndel(ref.getBytes()[0]);
return alleles;
}
/**
* Decode the filter field of this BCF2 file and store the result in the builder
* @param builder
*/
private void decodeFilter( final VariantContextBuilder builder ) {
final Object value = decoder.decodeTypedValue();
@ -283,17 +366,28 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.unfiltered();
else {
if ( value instanceof Integer )
// fast path for single integer result
builder.filter(getDictionaryString((Integer)value));
else {
for ( int offset : (List<Integer>)value )
for ( final int offset : (List<Integer>)value )
builder.filter(getDictionaryString(offset));
}
}
}
/**
* Loop over the info field key / value pairs in this BCF2 file and decode them into the builder
*
* @param builder
* @param numInfoFields
*/
@Requires("numInfoFields >= 0")
private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) {
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields);
if ( numInfoFields == 0 )
// fast path, don't bother doing any work if there are no fields
return;
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields);
for ( int i = 0; i < numInfoFields; i++ ) {
final String key = getDictionaryString();
Object value = decoder.decodeTypedValue();
@ -340,47 +434,63 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
final public int nGenotypeFields;
final public byte[] bytes;
// Contract must reference the actual parameter name (was the undefined
// "nGenotypeField", which would make the contract unevaluable).
@Requires({"nGenotypeFields > 0", "bytes != null"})
public LazyData(final int nGenotypeFields, final byte[] bytes) {
    this.nGenotypeFields = nGenotypeFields;
    this.bytes = bytes;
}
}
@Ensures("result != null")
private final String getDictionaryString() {
return getDictionaryString((Integer) decoder.decodeTypedValue());
}
@Requires("offset >= dictionary.size()")
@Ensures("result != null")
protected final String getDictionaryString(final int offset) {
if ( offset >= dictionary.size() ) throw new UserException.MalformedBCF2("BUG: no dictionary field found at offset " + offset);
final String field = dictionary.get(offset);
return field;
return dictionary.get(offset);
}
/**
* Translate the config offset as encoded in the BCF file into the actual string
* name of the contig from the dictionary
*
* @param contigOffset
* @return
*/
@Requires({"contigOffset >= 0", "contigOffset < contigNames.size()"})
@Ensures("result != null")
private final String lookupContigName( final int contigOffset ) {
if ( contigOffset < contigNames.size() ) {
return contigNames.get(contigOffset);
}
else {
throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames));
}
return contigNames.get(contigOffset);
}
@Requires("header != null")
@Ensures({"result != null", "! result.isEmpty()"})
private final ArrayList<String> parseDictionary(final VCFHeader header) {
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
// if we got here we never found a dictionary, or there are no elements in the dictionary
if ( dict.size() == 0 )
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
if ( dict.isEmpty() )
error("Dictionary header element was absent or empty");
return dict;
}
/**
* @return the VCFHeader we found in this BCF2 file
*/
protected VCFHeader getHeader() {
return header;
}
@Requires("field != null")
@Ensures("result != null")
protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) {
return gtFieldDecoders.getDecoder(field);
}
/**
 * Throw a MalformedBCF2 exception annotated with the current record number and position.
 *
 * The format string previously lacked a %s conversion, so the supplied message was
 * silently discarded by String.format (extra arguments are ignored). Fixed to include it.
 *
 * @param message description of the malformation encountered
 * @throws RuntimeException always (never returns normally)
 */
private final void error(final String message) throws RuntimeException {
    throw new UserException.MalformedBCF2(String.format("At record %d with position %d: %s", recordNo, pos, message));
}
}

View File

@ -40,8 +40,8 @@ import java.util.Arrays;
public final class BCF2Decoder {
final protected static Logger logger = Logger.getLogger(FeatureCodec.class);
byte[] recordBytes;
ByteArrayInputStream recordStream;
byte[] recordBytes = null;
ByteArrayInputStream recordStream = null;
public BCF2Decoder() {
// nothing to do
@ -69,6 +69,7 @@ public final class BCF2Decoder {
* @return
*/
public void readNextBlock(final int blockSizeInBytes, final InputStream stream) {
if ( blockSizeInBytes < 0 ) throw new UserException.MalformedBCF2("Invalid block size " + blockSizeInBytes);
setRecordBytes(readRecordBytes(blockSizeInBytes, stream));
}
@ -115,9 +116,9 @@ public final class BCF2Decoder {
*
* @param recordBytes
*/
@Requires("recordBytes != null")
@Ensures({"this.recordBytes == recordBytes", "recordStream != null"})
public void setRecordBytes(final byte[] recordBytes) {
assert recordBytes != null;
this.recordBytes = recordBytes;
this.recordStream = new ByteArrayInputStream(recordBytes);
}

View File

@ -32,7 +32,7 @@ import java.io.OutputStream;
import java.util.*;
/**
* Simple BCF2 encoder
* BCF2 encoder
*
* @author depristo
* @since 5/12
@ -255,12 +255,30 @@ public final class BCF2Encoder {
}
public final static void encodePrimitive(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException {
for ( int i = type.getSizeInBytes() - 1; i >= 0; i-- ) {
final int shift = i * 8;
int mask = 0xFF << shift;
int byteValue = (mask & value) >> shift;
encodeStream.write(byteValue);
switch ( type.getSizeInBytes() ) {
case 1:
encodeStream.write(0xFF & value);
break;
case 2:
encodeStream.write((0xFF00 & value) >> 8);
encodeStream.write(0xFF & value);
break;
case 4:
encodeStream.write((0xFF000000 & value) >> 24);
encodeStream.write((0x00FF0000 & value) >> 16);
encodeStream.write((0x0000FF00 & value) >> 8);
encodeStream.write((0x000000FF & value));
break;
default:
throw new ReviewedStingException("BUG: unexpected type size " + type);
}
// general case for reference
// for ( int i = type.getSizeInBytes() - 1; i >= 0; i-- ) {
// final int shift = i * 8;
// int mask = 0xFF << shift;
// int byteValue = (mask & value) >> shift;
// encodeStream.write(byteValue);
// }
}
private final List<Byte> stringToBytes(final String v) throws IOException {

View File

@ -29,17 +29,22 @@ import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
/**
* An efficient
* An efficient scheme for building and obtaining specialized
* genotype field decoders. Used by the BCFCodec to parse
* with little overhead the fields from BCF2 encoded genotype
* records
*
* @author Your Name
* @since Date created
* @author Mark DePristo
* @since 6/12
*/
public class BCF2GenotypeFieldDecoders {
final protected static Logger logger = Logger.getLogger(BCF2GenotypeFieldDecoders.class);
@ -54,7 +59,8 @@ public class BCF2GenotypeFieldDecoders {
// TODO -- fill in appropriate decoders for each FORMAT field in the header
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder());
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FLDecoder());
// currently the generic decoder handles FILTER values properly, in so far as we don't tolerate multiple filter field values per genotype
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new GenericDecoder());
genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder());
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder());
genotypeFieldDecoder.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, new PLDecoder());
@ -246,13 +252,6 @@ public class BCF2GenotypeFieldDecoders {
}
}
private class FLDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
throw new ReviewedStingException("Genotype filter not implemented in BCF2 yet");
}
}
private class GenericDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
@ -271,16 +270,4 @@ public class BCF2GenotypeFieldDecoders {
}
}
}
private static final int[] decodeIntArray(final Object value) {
// todo -- decode directly into int[]
final List<Integer> pls = (List<Integer>)value;
if ( pls != null ) { // we have a PL field
final int[] x = new int[pls.size()];
for ( int j = 0; j < x.length; j++ )
x[j] = pls.get(j);
return x;
} else
return null;
}
}

View File

@ -1,143 +0,0 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.broad.tribble.FeatureCodecHeader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.io.*;
import java.util.*;
/**
* Testing BCF2
*
* @author Mark DePristo
* @since 2012
*/
public class BCF2TestWalker extends RodWalker<Integer, Integer> {
/**
* Variants from this VCF file are used by this tool as input.
* The file must at least contain the standard VCF header lines, but
* can be empty (i.e., no variants are contained in the file).
*/
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
public RodBinding<VariantContext> variants;
@Argument(doc="keep variants", required=false)
public boolean keepVariants = false;
@Argument(doc="quiet", required=false)
public boolean quiet = false;
@Argument(doc="dontIndexOnTheFly", required=false)
public boolean dontIndexOnTheFly = false;
@Output(doc="File to which results should be written",required=true)
protected File bcfFile;
private final List<VariantContext> vcs = new ArrayList<VariantContext>();
protected VariantContextWriter writer;
@Override
public void initialize() {
final Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Collections.singletonList(variants));
final VCFHeader header = VCFUtils.withUpdatedContigs(vcfRods.values().iterator().next(), getToolkit());
try {
EnumSet<Options> options = EnumSet.of(Options.FORCE_BCF);
if ( !dontIndexOnTheFly ) options.add(Options.INDEX_ON_THE_FLY);
writer = VariantContextWriterFactory.create(bcfFile, new FileOutputStream(bcfFile), getToolkit().getMasterSequenceDictionary(), options);
writer.writeHeader(header);
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotCreateOutputFile(bcfFile, e);
}
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null ) // RodWalkers can make funky map calls
return 0;
for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
writer.add(vc);
if ( keepVariants ) vcs.add(vc);
}
return 1;
}
//
// default reduce -- doesn't do anything at all
//
public Integer reduceInit() { return 0; }
public Integer reduce(Integer counter, Integer sum) { return counter + sum; }
public void onTraversalDone(Integer sum) {
try {
writer.close();
logger.info("Closed writer");
// read in the BCF records
BCF2Codec codec = new BCF2Codec();
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(bcfFile));
FeatureCodecHeader header = codec.readHeader(pbs);
pbs.close();
pbs = new PositionalBufferedStream(new FileInputStream(bcfFile));
pbs.skip(header.getHeaderEnd());
Iterator<VariantContext> it = vcs.iterator();
while ( ! pbs.isDone() ) {
if ( keepVariants ) {
VariantContext expected = it.next();
if ( ! quiet )
System.out.printf("vcf = %s %d %s%n", expected.getChr(), expected.getStart(), expected);
}
VariantContext bcfRaw = codec.decode(pbs);
VariantContext bcf = new VariantContextBuilder(bcfRaw).source("variant").make();
if ( ! quiet ) {
System.out.printf("bcf = %s %d %s%n", bcf.getChr(), bcf.getStart(), bcf.toString());
System.out.printf("--------------------------------------------------%n");
}
}
} catch ( IOException e ) {
throw new UserException.CouldNotCreateOutputFile(bcfFile, "bad user!");
}
}
}

View File

@ -24,20 +24,22 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Requires;
import java.util.EnumSet;
/**
* BCF2 types and information
* BCF2 types and associated information
*
* @author depristo
* @since 05/12
*/
public enum BCF2Type {
INT8(1, 1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range
INT16(2, 2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767),
INT32(3, 4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647),
FLOAT(5, 4, BCF2Utils.FLOAT_MISSING_VALUE),
CHAR(7);
INT8 (1, 1, 0xFFFFFF80, -127, 127), // todo -- confirm range
INT16(2, 2, 0xFFFF8000, -32767, 32767),
INT32(3, 4, 0x80000000, -2147483647, 2147483647),
FLOAT(5, 4, 0x7F800001),
CHAR (7);
private final int id;
private final Object missingJavaValue;
@ -62,13 +64,49 @@ public enum BCF2Type {
this.maxValue = maxValue;
}
/**
* How many bytes are used to represent this type on disk?
* @return
*/
public int getSizeInBytes() {
return sizeInBytes;
}
/**
* The ID according to the BCF2 specification
* @return
*/
public int getID() { return id; }
/**
* Can we encode value v in this type, according to its declared range.
*
* Only makes sense for integer values
*
* @param v
* @return
*/
@Requires("INTEGERS.contains(this)")
public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }
/**
* Return the java object (aka null) that is used to represent a missing value for this
* type in Java
*
* @return
*/
public Object getMissingJavaValue() { return missingJavaValue; }
/**
* The bytes (encoded as an int) that are used to represent a missing value
* for this type in BCF2
*
* @return
*/
public int getMissingBytes() { return missingBytes; }
/**
* An enum set of the types that might represent Integer values
*/
public final static EnumSet<BCF2Type> INTEGERS = EnumSet.of(INT8, INT16, INT32);
}

View File

@ -53,12 +53,6 @@ public final class BCF2Utils {
public static final int OVERFLOW_ELEMENT_MARKER = 15;
public static final int MAX_INLINE_ELEMENTS = 14;
// Note that these values are prefixed by FFFFFF for convenience
public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
public static final int INT16_MISSING_VALUE = 0xFFFF8000;
public static final int INT32_MISSING_VALUE = 0x80000000;
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32};
public final static BCF2Type[] ID_TO_ENUM;

View File

@ -249,11 +249,12 @@ public class VCFCodec extends AbstractVCFCodec {
return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset());
}
private final static String[] INT_DECODE_ARRAY = new String[10000];
private final static int[] decodeInts(final String string) {
final String[] splits = string.split(",");
final int[] values = new int[splits.length];
for ( int i = 0; i < splits.length; i++ )
values[i] = Integer.valueOf(splits[i]);
final int nValues = ParsingUtils.split(string, INT_DECODE_ARRAY, ',');
final int[] values = new int[nValues];
for ( int i = 0; i < nValues; i++ )
values[i] = Integer.valueOf(INT_DECODE_ARRAY[i]);
return values;
}

View File

@ -34,10 +34,23 @@ import java.util.*;
/**
* A builder class for genotypes
*
* Provides convenience setter methods for all of the Genotype field
* values. Setter methods can be used in any order, allowing you to
* pass through states that wouldn't be allowed in the highly regulated
* immutable Genotype class.
*
* All fields default to meaningful MISSING values.
*
* Call make() to actually create the corresponding Genotype object from
* this builder. Can be called multiple times to create independent copies,
* or with intervening sets to conveniently make similar Genotypes with
* slight modifications.
*
* @author Mark DePristo
* @since 06/12
*/
public final class GenotypeBuilder {
public static boolean MAKE_FAST_BY_DEFAULT = false;
public static boolean MAKE_FAST_BY_DEFAULT = true;
private String sampleName = null;
private List<Allele> alleles = null;
@ -48,6 +61,7 @@ public final class GenotypeBuilder {
private int[] AD = null;
private int[] PL = null;
private Map<String, Object> extendedAttributes = null;
private int initialAttributeMapSize = 5;
private boolean useFast = MAKE_FAST_BY_DEFAULT;
@ -191,6 +205,11 @@ public final class GenotypeBuilder {
return this;
}
/**
* Set this genotype's name
* @param sampleName
* @return
*/
@Requires({"sampleName != null"})
@Ensures({"this.sampleName != null"})
public GenotypeBuilder name(final String sampleName) {
@ -198,6 +217,11 @@ public final class GenotypeBuilder {
return this;
}
/**
* Set this genotype's alleles
* @param alleles
* @return
*/
@Ensures({"this.alleles != null"})
public GenotypeBuilder alleles(final List<Allele> alleles) {
if ( alleles == null )
@ -207,6 +231,11 @@ public final class GenotypeBuilder {
return this;
}
/**
* Is this genotype phased?
* @param phased
* @return
*/
public GenotypeBuilder phased(final boolean phased) {
isPhased = phased;
return this;
@ -235,11 +264,34 @@ public final class GenotypeBuilder {
return GQ((int)Math.round(pLog10Error * -10));
}
/**
* This genotype has no GQ value
* @return
*/
public GenotypeBuilder noGQ() { GQ = -1; return this; }
/**
 * Declare that this genotype carries no AD (allele depth) value.
 * @return this builder, for chaining
 */
public GenotypeBuilder noAD() {
    this.AD = null;
    return this;
}
/**
 * Declare that this genotype carries no DP (read depth) value.
 * @return this builder, for chaining
 */
public GenotypeBuilder noDP() {
    this.DP = -1;
    return this;
}
/**
 * Declare that this genotype carries no PL (phred-scaled likelihoods) value.
 * @return this builder, for chaining
 */
public GenotypeBuilder noPL() {
    this.PL = null;
    return this;
}
/**
* This genotype has this DP value
* @return
*/
@Requires({"DP >= -1"})
@Ensures({"this.DP == DP"})
public GenotypeBuilder DP(final int DP) {
@ -247,6 +299,10 @@ public final class GenotypeBuilder {
return this;
}
/**
* This genotype has this AD value
* @return
*/
@Requires({"AD == null || AD.length > 0"})
@Ensures({"this.AD == AD"})
public GenotypeBuilder AD(final int[] AD) {
@ -254,6 +310,10 @@ public final class GenotypeBuilder {
return this;
}
/**
* This genotype has this PL value, as int[]. FAST
* @return
*/
@Requires("PL == null || PL.length > 0")
@Ensures({"this.PL == PL"})
public GenotypeBuilder PL(final int[] PL) {
@ -261,6 +321,10 @@ public final class GenotypeBuilder {
return this;
}
/**
* This genotype has this PL value, converted from double[]. SLOW
* @return
*/
@Requires("PL == null || PL.length > 0")
@Ensures({"this.PL == PL"})
public GenotypeBuilder PL(final double[] GLs) {
@ -268,6 +332,12 @@ public final class GenotypeBuilder {
return this;
}
/**
* This genotype has these attributes.
*
* Cannot contain inline attributes (DP, AD, GQ, PL)
* @return
*/
@Requires("attributes != null")
@Ensures("attributes.isEmpty() || extendedAttributes != null")
public GenotypeBuilder attributes(final Map<String, Object> attributes) {
@ -276,11 +346,17 @@ public final class GenotypeBuilder {
return this;
}
/**
* This genotype has this attribute key / value pair.
*
* Cannot contain inline attributes (DP, AD, GQ, PL)
* @return
*/
@Requires({"key != null"})
@Ensures({"extendedAttributes != null", "extendedAttributes.containsKey(key)"})
public GenotypeBuilder attribute(final String key, final Object value) {
if ( extendedAttributes == null )
extendedAttributes = new HashMap<String, Object>(5);
extendedAttributes = new HashMap<String, Object>(initialAttributeMapSize);
extendedAttributes.put(key, value);
return this;
}
@ -299,8 +375,34 @@ public final class GenotypeBuilder {
return this;
}
/**
 * Varargs convenience form of the list-based filters setter.
 *
 * @param filters the filter names to set on this genotype
 * @return this builder, for chaining
 */
@Requires("filters != null")
public GenotypeBuilder filters(final String ... filters) {
    return filters(Arrays.asList(filters));
}
/**
 * Declare that this genotype is unfiltered, removing any previously set
 * genotype filter value from the extended attributes.
 *
 * @return this builder, for chaining
 */
public GenotypeBuilder unfiltered() {
    // NOTE: removed bogus @Requires("filters != null") -- this method takes
    // no parameters, so that contract referred to nothing and could never
    // be meaningfully checked
    if ( extendedAttributes != null )
        extendedAttributes.remove(VCFConstants.GENOTYPE_FILTER_KEY);
    return this;
}
/**
 * Tells this builder that the genotype will carry at most this many
 * extended attributes; used to presize the extended attribute map.
 * @param i the expected maximum number of extended attributes
 * @return this builder, for chaining
 */
public GenotypeBuilder maxAttributes(final int i) {
    // TODO(review): unclear what remains to do here -- presumably validate i >= 0; confirm
    initialAttributeMapSize = i;
    return this;
}
}

View File

@ -48,6 +48,7 @@ public class GenotypeLikelihoods {
return new GenotypeLikelihoods(PLs);
}
/**
 * Create a GenotypeLikelihoods object from a legacy VCF GL field string.
 *
 * @param GLs the raw GL field value
 * @return likelihoods parsed from the deprecated GL string representation
 * @deprecated the GL string representation is a legacy format; prefer the PL-based factories
 */
@Deprecated
public final static GenotypeLikelihoods fromGLField(String GLs) {
    return new GenotypeLikelihoods(parseDeprecatedGLString(GLs));
}

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.variantcontext.writer;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec;
@ -39,7 +41,6 @@ import java.io.*;
import java.util.*;
class BCF2Writer extends IndexingVariantContextWriter {
private final static boolean FULLY_DECODE = true;
final protected static Logger logger = Logger.getLogger(BCF2Writer.class);
private final OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support
@ -63,12 +64,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
//
// --------------------------------------------------------------------------------
private final void createContigDictionary(final Collection<VCFContigHeaderLine> contigLines) {
int offset = 0;
for ( VCFContigHeaderLine contig : contigLines )
contigDictionary.put(contig.getID(), offset++);
}
@Override
public void writeHeader(final VCFHeader header) {
// create the config offsets map
@ -103,8 +98,11 @@ class BCF2Writer extends IndexingVariantContextWriter {
}
@Override
public void add( final VariantContext initialVC ) {
final VariantContext vc = FULLY_DECODE ? initialVC.fullyDecode(header) : initialVC;
public void add( VariantContext vc ) {
if ( doNotWriteGenotypes )
vc = new VariantContextBuilder(vc).noGenotypes().make();
vc = vc.fullyDecode(header);
super.add(vc); // allow on the fly indexing
try {
@ -253,9 +251,13 @@ class BCF2Writer extends IndexingVariantContextWriter {
}
}
// TODO TODO TODO TODO TODO
// TODO
// TODO -- we really need explicit converters as first class objects
// TODO -- need to generalize so we can enable vectors of compressed genotype ints
// TODO -- no sense in allocating these over and over
// TODO
// TODO TODO TODO TODO TODO
private final VCFToBCFEncoding prepFieldValueForEncoding(final String field, final Object value) {
final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, field);
final boolean isList = value instanceof List;
@ -269,8 +271,8 @@ class BCF2Writer extends IndexingVariantContextWriter {
case Flag:
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1));
case String:
final List<String> s = isList ? (List<String>)value : Collections.singletonList((String) value);
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, s);
final String s = isList ? BCF2Utils.collapseStringList((List<String>)value) : (String)value;
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, Collections.singletonList(s));
case Integer: // note integer calculation is a bit complex because of the need to determine sizes
List<Integer> l;
BCF2Type intType;
@ -297,19 +299,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
}
}
private final void addGenotypeFilters(final VariantContext vc) throws IOException {
logger.warn("Skipping genotype filter field");
// // TODO -- FIXME -- string is wrong here -- need to compute string size...
// startGenotypeField(VCFConstants.GENOTYPE_FILTER_KEY, 1, BCFType.CHAR);
// for ( final Genotype g : vc.getGenotypes() ) {
// if ( g.filtersWereApplied() && g.isFiltered() ) {
// encoder.encodeString(ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters())));
// } else {
// encoder.encodeRawMissingValues(1, BCFType.CHAR); // todo fixme
// }
// }
}
// --------------------------------------------------------------------------------
//
// Genotype field encoding
@ -319,10 +308,8 @@ class BCF2Writer extends IndexingVariantContextWriter {
private byte[] buildSamplesData(final VariantContext vc) throws IOException {
BCF2Codec.LazyData lazyData = getLazyData(vc);
if ( lazyData != null ) {
logger.info("Lazy decoded data detected, writing bytes directly to stream. N = " + lazyData.bytes.length);
return lazyData.bytes;
} else {
final GenotypesContext gc = vc.getGenotypes();
// we have to do work to convert the VC into a BCF2 byte stream
List<String> genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header);
for ( final String field : genotypeFields ) {
@ -330,8 +317,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
addGenotypes(vc);
} else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) {
addIntGenotypeField(field, intGenotypeFieldAccessors.getAccessor(field), vc);
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
addGenotypeFilters(vc);
} else {
addGenericGenotypeField(vc, field);
}
@ -388,6 +373,9 @@ class BCF2Writer extends IndexingVariantContextWriter {
if ( numInFormatField == 1 ) {
// we encode the actual allele, encodeRawValue handles the missing case where fieldValue == null
encoder.encodeRawValue(fieldValue, encoding.BCF2Type);
} else if ( encoding.vcfType == VCFHeaderLineType.String ) {
// we have multiple strings, so we need to collapse them
throw new ReviewedStingException("LIMITATION: Cannot encode arrays of string values in genotypes");
} else {
// multiple values, need to handle general case
final List<Object> asList = toList(fieldValue);
@ -402,6 +390,12 @@ class BCF2Writer extends IndexingVariantContextWriter {
}
}
// TODO TODO TODO TODO TODO
// TODO
// TODO THIS ROUTINE NEEDS TO BE OPTIMIZED. IT ACCOUNTS FOR A SIGNIFICANT AMOUNT OF THE
// TODO RUNTIME FOR WRITING OUT BCF FILES WITH MANY GENOTYPES
// TODO
// TODO TODO TODO TODO TODO
private final void addIntGenotypeField(final String field,
final IntGenotypeFieldAccessors.Accessor ige,
final VariantContext vc) throws IOException {
@ -429,13 +423,19 @@ class BCF2Writer extends IndexingVariantContextWriter {
encoder.encodePrimitive(pl == -1 ? type.getMissingBytes() : pl, type);
}
// TODO TODO TODO TODO TODO
// TODO
// TODO we should really have a fast path for encoding diploid genotypes where
// TODO we don't pay the overhead of creating the allele maps
// TODO
// TODO TODO TODO TODO TODO
private final void addGenotypes(final VariantContext vc) throws IOException {
if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES )
throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " +
"with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have "
+ vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart());
final Map<Allele, String> alleleMap = VCFWriter.buildAlleleMap(vc);
final Map<Allele, Integer> alleleMap = buildAlleleMap(vc);
final int maxPloidy = vc.getMaxPloidy();
startGenotypeField(VCFConstants.GENOTYPE_KEY, maxPloidy, BCF2Type.INT8);
for ( final String name : header.getGenotypeSamples() ) {
@ -446,7 +446,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
if ( i < samplePloidy ) {
// we encode the actual allele
final Allele a = alleles.get(i);
final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a));
final int offset = alleleMap.get(a);
final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00);
encoder.encodePrimitive(encoded, BCF2Type.INT8);
} else {
@ -457,6 +457,18 @@ class BCF2Writer extends IndexingVariantContextWriter {
}
}
/**
 * Build a lookup from each allele of the variant context to its index in
 * the VC's allele list, plus a NO_CALL -> -1 entry for convenient lookup.
 *
 * @param vc the variant context whose alleles should be indexed
 * @return map from allele to its 0-based allele index (NO_CALL maps to -1)
 */
private final static Map<Allele, Integer> buildAlleleMap(final VariantContext vc) {
    final List<Allele> vcAlleles = vc.getAlleles();
    final Map<Allele, Integer> indexByAllele = new HashMap<Allele, Integer>(vcAlleles.size() + 1);
    indexByAllele.put(Allele.NO_CALL, -1); // convenience for lookup
    int index = 0;
    for ( final Allele allele : vcAlleles )
        indexByAllele.put(allele, index++);
    return indexByAllele;
}
// --------------------------------------------------------------------------------
//
// Low-level block encoding
@ -470,24 +482,26 @@ class BCF2Writer extends IndexingVariantContextWriter {
*
* @throws IOException
*/
/**
 * Write the packed site (info) and genotypes blocks for one record to the
 * output stream, each prefixed by its length as an INT32.
 *
 * @param infoBlock the encoded site-level data, must be non-empty
 * @param genotypesBlock the encoded genotype data, may be empty (sites-only files)
 * @throws IOException on stream write failure
 */
@Requires({"infoBlock.length > 0", "genotypesBlock.length >= 0"})
private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException {
    // runtime asserts removed -- they merely duplicated the @Requires contract
    // (and length >= 0 is vacuously true for any non-null array)
    BCF2Encoder.encodePrimitive(infoBlock.length, BCF2Type.INT32, outputStream);
    BCF2Encoder.encodePrimitive(genotypesBlock.length, BCF2Type.INT32, outputStream);
    outputStream.write(infoBlock);
    outputStream.write(genotypesBlock);
}
// TODO -- obvious optimization case
/**
 * Encode a single string by reference: look up the string's offset in the
 * header string dictionary and write it as a typed integer.
 *
 * @param string the string to encode; must be present in the header dictionary
 * @return the integer type used to encode the dictionary offset
 * @throws IOException on stream write failure
 */
@Requires("string != null")
@Ensures("BCF2Type.INTEGERS.contains(result)")
private final BCF2Type encodeStringByRef(final String string) throws IOException {
    // removed merge residue: the old delegation to encodeStringsByRef returned
    // unconditionally, leaving the dictionary-lookup statements unreachable
    final Integer offset = stringDictionaryMap.get(string);
    if ( offset == null )
        throw new ReviewedStingException("Format error: could not find string " + string + " in header as required by BCF");
    final BCF2Type type = encoder.determineIntegerType(offset);
    encoder.encodeTyped(offset, type);
    return type;
}
// TODO -- in size == 1 case branch to singleton fast-path
@Requires("! strings.isEmpty()")
@Ensures("BCF2Type.INTEGERS.contains(result)")
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
assert ! strings.isEmpty();
@ -517,17 +531,38 @@ class BCF2Writer extends IndexingVariantContextWriter {
return maxType;
}
/**
 * Write the start of a genotype (FORMAT) field block: the field key, encoded
 * by reference into the string dictionary, followed by the typed size and
 * value-type descriptor.
 *
 * @param key the FORMAT field key; must be a non-empty string present in the header
 * @param size the number of values per sample for this field, >= 0
 * @param valueType the BCF2 type used to encode each value
 * @throws IOException on stream write failure
 */
@Requires({"key != null", "! key.equals(\"\")", "size >= 0"})
private final void startGenotypeField(final String key, final int size, final BCF2Type valueType) throws IOException {
    // runtime asserts removed -- they merely duplicated the @Requires contract
    encodeStringByRef(key);
    encoder.encodeType(size, valueType);
}
/**
 * Normalize an arbitrary object into a list view of it:
 *
 *   null      => empty list
 *   a List    => the list itself
 *   otherwise => a singleton list containing o
 *
 * @param o the object to normalize, may be null
 * @return a (possibly empty) list representation of o, never null
 */
private final static List<Object> toList(final Object o) {
    if ( o instanceof List )
        return (List<Object>)o;
    return o == null ? Collections.<Object>emptyList() : Collections.singletonList(o);
}
/**
 * Populate the contigDictionary from the contig lines of the VCF header,
 * assigning each contig a sequential offset in iteration order.
 *
 * @param contigLines the contig header lines extracted from the VCF header
 */
@Requires("contigDictionary.isEmpty()")
private final void createContigDictionary(final Collection<VCFContigHeaderLine> contigLines) {
    int nextOffset = 0;
    for ( final VCFContigHeaderLine contigLine : contigLines ) {
        contigDictionary.put(contigLine.getID(), nextOffset);
        nextOffset++;
    }
}
}

View File

@ -251,7 +251,7 @@ class VCFWriter extends IndexingVariantContextWriter {
}
}
public static Map<Allele, String> buildAlleleMap(final VariantContext vc) {
private static Map<Allele, String> buildAlleleMap(final VariantContext vc) {
final Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size()+1);
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup

View File

@ -161,9 +161,13 @@ public class VariantContextTestProvider {
metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
metaData.add(new VCFInfoHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality"));
metaData.add(new VCFInfoHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
metaData.add(new VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality"));
metaData.add(new VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
metaData.add(new VCFFormatHeaderLine("GS", 2, VCFHeaderLineType.String, "A doubleton list of strings"));
metaData.add(new VCFFormatHeaderLine("GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "A variable list of strings"));
metaData.add(new VCFFormatHeaderLine("FT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype filters"));
// prep the header
metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0));
@ -386,7 +390,53 @@ public class VariantContextTestProvider {
attr("g1", ref, "FLOAT3", 1.0, 2.0, 3.0),
attr("g2", ref, "FLOAT3"));
// test test Integer, Float, Flag, String atomic, vector, and missing types of different lengths per sample
//
//
// TESTING MULTIPLE SIZED LISTS IN THE GENOTYPE FIELD
//
//
addGenotypeTests(site,
attr("g1", ref, "GS", Arrays.asList("S1", "S2")),
attr("g2", ref, "GS", Arrays.asList("S3", "S4")));
addGenotypeTests(site, // g1 is missing the string, and g2 is missing FLOAT1
attr("g1", ref, "FLOAT1", 1.0),
attr("g2", ref, "GS", Arrays.asList("S3", "S4")));
// variable sized lists
addGenotypeTests(site,
attr("g1", ref, "GV", Arrays.asList("S1")),
attr("g2", ref, "GV", Arrays.asList("S3", "S4")));
addGenotypeTests(site,
attr("g1", ref, "GV", Arrays.asList("S1", "S2")),
attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5")));
addGenotypeTests(site, // missing value in varlist of string
attr("g1", ref, "FLOAT1", 1.0),
attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5")));
//
//
// TESTING GENOTYPE FILTERS
//
//
addGenotypeTests(site,
new GenotypeBuilder("g1", Arrays.asList(ref, ref)).filters("X").make(),
new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make());
addGenotypeTests(site,
new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(),
new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make());
addGenotypeTests(site,
new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(),
new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X", "Y").make());
addGenotypeTests(site,
new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(),
new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make(),
new GenotypeBuilder("g3", Arrays.asList(ref, ref)).filters("X", "Y").make());
// TODO -- test test Integer, Float, Flag, String atomic, vector, and missing types of different lengths per sample
}
private static Genotype attr(final String name, final Allele ref, final String key, final Object ... value) {
@ -493,7 +543,27 @@ public class VariantContextTestProvider {
Assert.assertEquals(actual.getSampleName(), expected.getSampleName());
Assert.assertEquals(actual.getAlleles(), expected.getAlleles());
Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString());
Assert.assertEquals(actual.getType(), expected.getType());
// filters are the same
Assert.assertEquals(actual.getFilters(), expected.getFilters());
Assert.assertEquals(actual.isFiltered(), expected.isFiltered());
Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied());
// inline attributes
Assert.assertEquals(actual.getDP(), expected.getDP());
Assert.assertEquals(actual.getAD(), expected.getAD());
Assert.assertEquals(actual.getGQ(), expected.getGQ());
Assert.assertEquals(actual.hasPL(), expected.hasPL());
Assert.assertEquals(actual.hasAD(), expected.hasAD());
Assert.assertEquals(actual.hasGQ(), expected.hasGQ());
Assert.assertEquals(actual.hasDP(), expected.hasDP());
Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods());
Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString());
Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods());
Assert.assertEquals(actual.getPL(), expected.getPL());
Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual());
assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes());
Assert.assertEquals(actual.isPhased(), expected.isPhased());