Finalizing BCF2 v2
-- FastGenotypes are the default in the engine. Use --useSlowGenotypes engine argument to return to old representation
-- Cleanup of BCF2Codec. Good error handling. Added contracts and docs.
-- Added a few more contracts and docs to BCF2Decoder
-- Optimized encodePrimitive in BCF2Encoder
-- Removed genotype filter field exceptions
-- Docs and cleanup of BCF2GenotypeFieldDecoders
-- Deleted unused BCF2TestWalker
-- Docs and cleanup of BCF2Types
-- Faster version of decodeInts in VCFCodec
-- BCF2Writer
-- Support for writing a sites only file
-- Lots of TODOs for future optimizations
-- Removed lack of filter field support
-- No longer uses the alleleMap from VCFWriter, which was an Allele -> String, now uses Allele -> Integer which is faster and more natural
-- Lots of docs and contracts
-- Docs for GenotypeBuilder. More filter creation routines (unfiltered, for example)
-- More extensive tests in VariantContextTestProfiler, including variable length strings in genotypes and genotype filters. Better genotype comparisons
This commit is contained in:
parent
938d3ada32
commit
43ad890fcc
|
|
@ -223,8 +223,8 @@ public class GenomeAnalysisEngine {
|
|||
resetRandomGenerator(System.currentTimeMillis());
|
||||
|
||||
// TODO -- REMOVE ME WHEN WE STOP BCF testing
|
||||
if ( this.getArguments().USE_FAST_GENOTYPES )
|
||||
GenotypeBuilder.MAKE_FAST_BY_DEFAULT = true;
|
||||
if ( this.getArguments().USE_SLOW_GENOTYPES )
|
||||
GenotypeBuilder.MAKE_FAST_BY_DEFAULT = false;
|
||||
|
||||
// if the user specified an input BQSR recalibration table then enable on the fly recalibration
|
||||
if (this.getArguments().BQSR_RECAL_FILE != null)
|
||||
|
|
|
|||
|
|
@ -336,9 +336,9 @@ public class GATKArgumentCollection {
|
|||
public boolean generateShadowBCF = false;
|
||||
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
|
||||
|
||||
@Argument(fullName="useFastGenotypes",shortName = "useFastGenotypes",doc="",required=false)
|
||||
@Argument(fullName="useSlowGenotypes",shortName = "useSlowGenotypes",doc="",required=false)
|
||||
@Hidden
|
||||
public boolean USE_FAST_GENOTYPES = false;
|
||||
public boolean USE_SLOW_GENOTYPES = false;
|
||||
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -24,6 +24,8 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.Feature;
|
||||
|
|
@ -43,16 +45,45 @@ import java.io.FileNotFoundException;
|
|||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
|
||||
/**
|
||||
* Decode BCF2 files
|
||||
*/
|
||||
public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
|
||||
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
|
||||
private VCFHeader header = null;
|
||||
|
||||
/**
|
||||
* Maps offsets (encoded in BCF) into contig names (from header) for the CHROM field
|
||||
*/
|
||||
private final ArrayList<String> contigNames = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* Maps header string names (encoded in VCF) into strings found in the BCF header
|
||||
*
|
||||
* Initialized when processing the header
|
||||
*/
|
||||
private ArrayList<String> dictionary;
|
||||
|
||||
/**
|
||||
* Our decoder that reads low-level objects from the BCF2 records
|
||||
*/
|
||||
private final BCF2Decoder decoder = new BCF2Decoder();
|
||||
private boolean skipGenotypes = false;
|
||||
|
||||
/**
|
||||
* Provides some sanity checking on the header
|
||||
*/
|
||||
private final static int MAX_HEADER_SIZE = 0x08000000;
|
||||
|
||||
/**
|
||||
* Genotype field decoders that are initialized when the header is read
|
||||
*/
|
||||
private BCF2GenotypeFieldDecoders gtFieldDecoders = null;
|
||||
|
||||
// for error handling
|
||||
private int recordNo = 0;
|
||||
private int pos = 0;
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Feature codec interface functions
|
||||
|
|
@ -61,28 +92,30 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
|
||||
@Override
|
||||
public Feature decodeLoc( final PositionalBufferedStream inputStream ) {
|
||||
return decode(inputStream);
|
||||
// TODO: a less expensive version of decodeLoc() that doesn't use VariantContext
|
||||
// TODO: very easy -- just decodeSitesBlock, and then skip to end of end of sites block
|
||||
// TODO: and then skip genotypes block
|
||||
recordNo++;
|
||||
final VariantContextBuilder builder = new VariantContextBuilder();
|
||||
|
||||
final int sitesBlockSize = decoder.readBlockSize(inputStream);
|
||||
final int genotypeBlockSize = decoder.readBlockSize(inputStream); // necessary because it's in the stream
|
||||
decoder.readNextBlock(sitesBlockSize, inputStream);
|
||||
decodeSiteLoc(builder);
|
||||
|
||||
return builder.fullyDecoded(true).make();
|
||||
}
|
||||
|
||||
@Override
|
||||
public VariantContext decode( final PositionalBufferedStream inputStream ) {
|
||||
recordNo++;
|
||||
final VariantContextBuilder builder = new VariantContextBuilder();
|
||||
|
||||
final int sitesBlockSize = decoder.readBlockSize(inputStream);
|
||||
final int genotypeBlockSize = decoder.readBlockSize(inputStream);
|
||||
decoder.readNextBlock(sitesBlockSize, inputStream);
|
||||
final SitesInfoForDecoding info = decodeSitesBlock(builder);
|
||||
|
||||
if ( isSkippingGenotypes() ) {
|
||||
decoder.skipNextBlock(genotypeBlockSize, inputStream);
|
||||
} else {
|
||||
decoder.readNextBlock(genotypeBlockSize, inputStream);
|
||||
createLazyGenotypesDecoder(info, builder);
|
||||
}
|
||||
decodeSiteLoc(builder);
|
||||
final SitesInfoForDecoding info = decodeSitesExtendedInfo(builder);
|
||||
|
||||
decoder.readNextBlock(genotypeBlockSize, inputStream);
|
||||
createLazyGenotypesDecoder(info, builder);
|
||||
return builder.fullyDecoded(true).make();
|
||||
}
|
||||
|
||||
|
|
@ -96,16 +129,16 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
try {
|
||||
// note that this reads the magic as well, and so does double duty
|
||||
if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) )
|
||||
throw new UserException.MalformedBCF2("Input stream does not begin with BCF2 magic");
|
||||
error("Input stream does not begin with BCF2 magic");
|
||||
|
||||
final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream);
|
||||
|
||||
if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB
|
||||
throw new UserException.MalformedBCF2("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
|
||||
error("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
|
||||
|
||||
final byte[] headerBytes = new byte[headerSizeInBytes];
|
||||
if ( inputStream.read(headerBytes) != headerSizeInBytes )
|
||||
throw new UserException.MalformedBCF2("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
|
||||
error("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
|
||||
|
||||
final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes));
|
||||
final AsciiLineReader headerReader = new AsciiLineReader(bps);
|
||||
|
|
@ -120,8 +153,11 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
if ( ! header.getContigLines().isEmpty() ) {
|
||||
logger.info("Found contig lines in BCF2 file, using those");
|
||||
contigNames.clear();
|
||||
for ( final VCFContigHeaderLine contig : header.getContigLines())
|
||||
for ( final VCFContigHeaderLine contig : header.getContigLines()) {
|
||||
if ( contig.getID() == null || contig.getID().equals("") )
|
||||
error("found a contig with an invalid ID " + contig);
|
||||
contigNames.add(contig.getID());
|
||||
}
|
||||
} else {
|
||||
logger.info("Didn't find any contig lines in BCF2 file, falling back (dangerously) to GATK reference dictionary");
|
||||
}
|
||||
|
|
@ -161,7 +197,6 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
|
||||
@Override
|
||||
public void setGenomeLocParser(final GenomeLocParser genomeLocParser) {
|
||||
// initialize contigNames to standard ones in reference
|
||||
|
|
@ -169,14 +204,6 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
contigNames.add(contig.getSequenceName());
|
||||
}
|
||||
|
||||
public boolean isSkippingGenotypes() {
|
||||
return skipGenotypes;
|
||||
}
|
||||
|
||||
public void setSkipGenotypes(final boolean skipGenotypes) {
|
||||
this.skipGenotypes = skipGenotypes;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// implicit block
|
||||
|
|
@ -190,16 +217,33 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) {
|
||||
/**
|
||||
* Decode the sites level data from this classes decoder
|
||||
*
|
||||
* @param builder
|
||||
* @return
|
||||
*/
|
||||
@Requires({"builder != null"})
|
||||
private final void decodeSiteLoc(final VariantContextBuilder builder) {
|
||||
final int contigOffset = decoder.decodeInt(BCF2Type.INT32);
|
||||
final String contig = lookupContigName(contigOffset);
|
||||
builder.chr(contig);
|
||||
|
||||
final int pos = decoder.decodeInt(BCF2Type.INT32);
|
||||
this.pos = decoder.decodeInt(BCF2Type.INT32);
|
||||
final int refLength = decoder.decodeInt(BCF2Type.INT32);
|
||||
builder.start((long)pos);
|
||||
builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode the sites level data from this classes decoder
|
||||
*
|
||||
* @param builder
|
||||
* @return
|
||||
*/
|
||||
@Requires({"builder != null", "decoder != null"})
|
||||
@Ensures({"result != null", "result.isValid()"})
|
||||
private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextBuilder builder) {
|
||||
final Object qual = decoder.decodeSingleValue(BCF2Type.FLOAT);
|
||||
if ( qual != null ) {
|
||||
builder.log10PError(((Double)qual) / -10.0);
|
||||
|
|
@ -217,23 +261,39 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
decodeFilter(builder);
|
||||
decodeInfo(builder, nInfo);
|
||||
|
||||
return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles);
|
||||
final SitesInfoForDecoding info = new SitesInfoForDecoding(nFormatFields, nSamples, alleles);
|
||||
if ( ! info.isValid() )
|
||||
error("Sites info is malformed: " + info);
|
||||
return info;
|
||||
}
|
||||
|
||||
protected final static class SitesInfoForDecoding {
|
||||
final int pos;
|
||||
final int nFormatFields;
|
||||
final int nSamples;
|
||||
final ArrayList<Allele> alleles;
|
||||
|
||||
private SitesInfoForDecoding(final int pos, final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) {
|
||||
this.pos = pos;
|
||||
private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) {
|
||||
this.nFormatFields = nFormatFields;
|
||||
this.nSamples = nSamples;
|
||||
this.alleles = alleles;
|
||||
}
|
||||
|
||||
public boolean isValid() {
|
||||
return nFormatFields >= 0 &&
|
||||
nSamples >= 0 &&
|
||||
alleles != null && ! alleles.isEmpty() && alleles.get(0).isReference();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("nFormatFields = %d, nSamples = %d, alleles = %s", nFormatFields, nSamples, alleles);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode the id field in this BCF2 file and store it in the builder
|
||||
* @param builder
|
||||
*/
|
||||
private void decodeID( final VariantContextBuilder builder ) {
|
||||
final String id = (String)decoder.decodeTypedValue();
|
||||
|
||||
|
|
@ -243,6 +303,15 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
builder.id(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Annoying routine that deals with allele clipping from the BCF2 encoding to the standard
|
||||
* GATK encoding.
|
||||
*
|
||||
* @param position
|
||||
* @param ref
|
||||
* @param unclippedAlleles
|
||||
* @return
|
||||
*/
|
||||
protected static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) {
|
||||
if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) {
|
||||
ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size());
|
||||
|
|
@ -252,6 +321,14 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
return unclippedAlleles;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode the alleles from this BCF2 file and put the results in builder
|
||||
* @param builder
|
||||
* @param pos
|
||||
* @param nAlleles
|
||||
* @return the alleles
|
||||
*/
|
||||
@Requires("nAlleles > 0")
|
||||
private ArrayList<Allele> decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) {
|
||||
// TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes
|
||||
ArrayList<Allele> alleles = new ArrayList<Allele>(nAlleles);
|
||||
|
|
@ -267,15 +344,21 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
alleles.add(Allele.create(allele, false));
|
||||
}
|
||||
}
|
||||
assert ref != null;
|
||||
|
||||
alleles = clipAllelesIfNecessary(pos, ref, alleles);
|
||||
builder.alleles(alleles);
|
||||
|
||||
assert ref.length() > 0;
|
||||
builder.referenceBaseForIndel(ref.getBytes()[0]);
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode the filter field of this BCF2 file and store the result in the builder
|
||||
* @param builder
|
||||
*/
|
||||
private void decodeFilter( final VariantContextBuilder builder ) {
|
||||
final Object value = decoder.decodeTypedValue();
|
||||
|
||||
|
|
@ -283,17 +366,28 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
builder.unfiltered();
|
||||
else {
|
||||
if ( value instanceof Integer )
|
||||
// fast path for single integer result
|
||||
builder.filter(getDictionaryString((Integer)value));
|
||||
else {
|
||||
for ( int offset : (List<Integer>)value )
|
||||
for ( final int offset : (List<Integer>)value )
|
||||
builder.filter(getDictionaryString(offset));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop over the info field key / value pairs in this BCF2 file and decode them into the builder
|
||||
*
|
||||
* @param builder
|
||||
* @param numInfoFields
|
||||
*/
|
||||
@Requires("numInfoFields >= 0")
|
||||
private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) {
|
||||
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields);
|
||||
if ( numInfoFields == 0 )
|
||||
// fast path, don't bother doing any work if there are no fields
|
||||
return;
|
||||
|
||||
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields);
|
||||
for ( int i = 0; i < numInfoFields; i++ ) {
|
||||
final String key = getDictionaryString();
|
||||
Object value = decoder.decodeTypedValue();
|
||||
|
|
@ -340,47 +434,63 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
final public int nGenotypeFields;
|
||||
final public byte[] bytes;
|
||||
|
||||
@Requires({"nGenotypeField > 0", "bytes != null"})
|
||||
public LazyData(final int nGenotypeFields, final byte[] bytes) {
|
||||
this.nGenotypeFields = nGenotypeFields;
|
||||
this.bytes = bytes;
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
private final String getDictionaryString() {
|
||||
return getDictionaryString((Integer) decoder.decodeTypedValue());
|
||||
}
|
||||
|
||||
@Requires("offset >= dictionary.size()")
|
||||
@Ensures("result != null")
|
||||
protected final String getDictionaryString(final int offset) {
|
||||
if ( offset >= dictionary.size() ) throw new UserException.MalformedBCF2("BUG: no dictionary field found at offset " + offset);
|
||||
final String field = dictionary.get(offset);
|
||||
return field;
|
||||
return dictionary.get(offset);
|
||||
}
|
||||
|
||||
/**
|
||||
* Translate the config offset as encoded in the BCF file into the actual string
|
||||
* name of the contig from the dictionary
|
||||
*
|
||||
* @param contigOffset
|
||||
* @return
|
||||
*/
|
||||
@Requires({"contigOffset >= 0", "contigOffset < contigNames.size()"})
|
||||
@Ensures("result != null")
|
||||
private final String lookupContigName( final int contigOffset ) {
|
||||
if ( contigOffset < contigNames.size() ) {
|
||||
return contigNames.get(contigOffset);
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames));
|
||||
}
|
||||
return contigNames.get(contigOffset);
|
||||
}
|
||||
|
||||
|
||||
@Requires("header != null")
|
||||
@Ensures({"result != null", "! result.isEmpty()"})
|
||||
private final ArrayList<String> parseDictionary(final VCFHeader header) {
|
||||
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
||||
|
||||
// if we got here we never found a dictionary, or there are no elements in the dictionary
|
||||
if ( dict.size() == 0 )
|
||||
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
|
||||
if ( dict.isEmpty() )
|
||||
error("Dictionary header element was absent or empty");
|
||||
|
||||
return dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the VCFHeader we found in this BCF2 file
|
||||
*/
|
||||
protected VCFHeader getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
@Requires("field != null")
|
||||
@Ensures("result != null")
|
||||
protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) {
|
||||
return gtFieldDecoders.getDecoder(field);
|
||||
}
|
||||
|
||||
private final void error(final String message) throws RuntimeException {
|
||||
throw new UserException.MalformedBCF2(String.format("At record %d with position %d:", recordNo, pos, message));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -40,8 +40,8 @@ import java.util.Arrays;
|
|||
public final class BCF2Decoder {
|
||||
final protected static Logger logger = Logger.getLogger(FeatureCodec.class);
|
||||
|
||||
byte[] recordBytes;
|
||||
ByteArrayInputStream recordStream;
|
||||
byte[] recordBytes = null;
|
||||
ByteArrayInputStream recordStream = null;
|
||||
|
||||
public BCF2Decoder() {
|
||||
// nothing to do
|
||||
|
|
@ -69,6 +69,7 @@ public final class BCF2Decoder {
|
|||
* @return
|
||||
*/
|
||||
public void readNextBlock(final int blockSizeInBytes, final InputStream stream) {
|
||||
if ( blockSizeInBytes < 0 ) throw new UserException.MalformedBCF2("Invalid block size " + blockSizeInBytes);
|
||||
setRecordBytes(readRecordBytes(blockSizeInBytes, stream));
|
||||
}
|
||||
|
||||
|
|
@ -115,9 +116,9 @@ public final class BCF2Decoder {
|
|||
*
|
||||
* @param recordBytes
|
||||
*/
|
||||
@Requires("recordBytes != null")
|
||||
@Ensures({"this.recordBytes == recordBytes", "recordStream != null"})
|
||||
public void setRecordBytes(final byte[] recordBytes) {
|
||||
assert recordBytes != null;
|
||||
|
||||
this.recordBytes = recordBytes;
|
||||
this.recordStream = new ByteArrayInputStream(recordBytes);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ import java.io.OutputStream;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Simple BCF2 encoder
|
||||
* BCF2 encoder
|
||||
*
|
||||
* @author depristo
|
||||
* @since 5/12
|
||||
|
|
@ -255,12 +255,30 @@ public final class BCF2Encoder {
|
|||
}
|
||||
|
||||
public final static void encodePrimitive(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException {
|
||||
for ( int i = type.getSizeInBytes() - 1; i >= 0; i-- ) {
|
||||
final int shift = i * 8;
|
||||
int mask = 0xFF << shift;
|
||||
int byteValue = (mask & value) >> shift;
|
||||
encodeStream.write(byteValue);
|
||||
switch ( type.getSizeInBytes() ) {
|
||||
case 1:
|
||||
encodeStream.write(0xFF & value);
|
||||
break;
|
||||
case 2:
|
||||
encodeStream.write((0xFF00 & value) >> 8);
|
||||
encodeStream.write(0xFF & value);
|
||||
break;
|
||||
case 4:
|
||||
encodeStream.write((0xFF000000 & value) >> 24);
|
||||
encodeStream.write((0x00FF0000 & value) >> 16);
|
||||
encodeStream.write((0x0000FF00 & value) >> 8);
|
||||
encodeStream.write((0x000000FF & value));
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException("BUG: unexpected type size " + type);
|
||||
}
|
||||
// general case for reference
|
||||
// for ( int i = type.getSizeInBytes() - 1; i >= 0; i-- ) {
|
||||
// final int shift = i * 8;
|
||||
// int mask = 0xFF << shift;
|
||||
// int byteValue = (mask & value) >> shift;
|
||||
// encodeStream.write(byteValue);
|
||||
// }
|
||||
}
|
||||
|
||||
private final List<Byte> stringToBytes(final String v) throws IOException {
|
||||
|
|
|
|||
|
|
@ -29,17 +29,22 @@ import com.google.java.contract.Requires;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* An efficient
|
||||
* An efficient scheme for building and obtaining specialized
|
||||
* genotype field decoders. Used by the BCFCodec to parse
|
||||
* with little overhead the fields from BCF2 encoded genotype
|
||||
* records
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
* @author Mark DePristo
|
||||
* @since 6/12
|
||||
*/
|
||||
public class BCF2GenotypeFieldDecoders {
|
||||
final protected static Logger logger = Logger.getLogger(BCF2GenotypeFieldDecoders.class);
|
||||
|
|
@ -54,7 +59,8 @@ public class BCF2GenotypeFieldDecoders {
|
|||
// TODO -- fill in appropriate decoders for each FORMAT field in the header
|
||||
|
||||
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder());
|
||||
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FLDecoder());
|
||||
// currently the generic decoder handles FILTER values properly, in so far as we don't tolerate multiple filter field values per genotype
|
||||
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new GenericDecoder());
|
||||
genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder());
|
||||
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder());
|
||||
genotypeFieldDecoder.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, new PLDecoder());
|
||||
|
|
@ -246,13 +252,6 @@ public class BCF2GenotypeFieldDecoders {
|
|||
}
|
||||
}
|
||||
|
||||
private class FLDecoder implements Decoder {
|
||||
@Override
|
||||
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
|
||||
throw new ReviewedStingException("Genotype filter not implemented in BCF2 yet");
|
||||
}
|
||||
}
|
||||
|
||||
private class GenericDecoder implements Decoder {
|
||||
@Override
|
||||
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
|
||||
|
|
@ -271,16 +270,4 @@ public class BCF2GenotypeFieldDecoders {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static final int[] decodeIntArray(final Object value) {
|
||||
// todo -- decode directly into int[]
|
||||
final List<Integer> pls = (List<Integer>)value;
|
||||
if ( pls != null ) { // we have a PL field
|
||||
final int[] x = new int[pls.size()];
|
||||
for ( int j = 0; j < x.length; j++ )
|
||||
x[j] = pls.get(j);
|
||||
return x;
|
||||
} else
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,143 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||
|
||||
import org.broad.tribble.FeatureCodecHeader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Testing BCF2
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 2012
|
||||
*/
|
||||
public class BCF2TestWalker extends RodWalker<Integer, Integer> {
|
||||
/**
|
||||
* Variants from this VCF file are used by this tool as input.
|
||||
* The file must at least contain the standard VCF header lines, but
|
||||
* can be empty (i.e., no variants are contained in the file).
|
||||
*/
|
||||
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
|
||||
public RodBinding<VariantContext> variants;
|
||||
|
||||
@Argument(doc="keep variants", required=false)
|
||||
public boolean keepVariants = false;
|
||||
|
||||
@Argument(doc="quiet", required=false)
|
||||
public boolean quiet = false;
|
||||
|
||||
@Argument(doc="dontIndexOnTheFly", required=false)
|
||||
public boolean dontIndexOnTheFly = false;
|
||||
|
||||
@Output(doc="File to which results should be written",required=true)
|
||||
protected File bcfFile;
|
||||
|
||||
private final List<VariantContext> vcs = new ArrayList<VariantContext>();
|
||||
protected VariantContextWriter writer;
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
final Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Collections.singletonList(variants));
|
||||
final VCFHeader header = VCFUtils.withUpdatedContigs(vcfRods.values().iterator().next(), getToolkit());
|
||||
try {
|
||||
EnumSet<Options> options = EnumSet.of(Options.FORCE_BCF);
|
||||
if ( !dontIndexOnTheFly ) options.add(Options.INDEX_ON_THE_FLY);
|
||||
writer = VariantContextWriterFactory.create(bcfFile, new FileOutputStream(bcfFile), getToolkit().getMasterSequenceDictionary(), options);
|
||||
writer.writeHeader(header);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(bcfFile, e);
|
||||
}
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null ) // RodWalkers can make funky map calls
|
||||
return 0;
|
||||
|
||||
for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
|
||||
writer.add(vc);
|
||||
if ( keepVariants ) vcs.add(vc);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
//
|
||||
// default reduce -- doesn't do anything at all
|
||||
//
|
||||
public Integer reduceInit() { return 0; }
|
||||
public Integer reduce(Integer counter, Integer sum) { return counter + sum; }
|
||||
|
||||
public void onTraversalDone(Integer sum) {
|
||||
try {
|
||||
writer.close();
|
||||
logger.info("Closed writer");
|
||||
|
||||
// read in the BCF records
|
||||
BCF2Codec codec = new BCF2Codec();
|
||||
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(bcfFile));
|
||||
FeatureCodecHeader header = codec.readHeader(pbs);
|
||||
pbs.close();
|
||||
|
||||
pbs = new PositionalBufferedStream(new FileInputStream(bcfFile));
|
||||
pbs.skip(header.getHeaderEnd());
|
||||
Iterator<VariantContext> it = vcs.iterator();
|
||||
while ( ! pbs.isDone() ) {
|
||||
if ( keepVariants ) {
|
||||
VariantContext expected = it.next();
|
||||
if ( ! quiet )
|
||||
System.out.printf("vcf = %s %d %s%n", expected.getChr(), expected.getStart(), expected);
|
||||
}
|
||||
VariantContext bcfRaw = codec.decode(pbs);
|
||||
VariantContext bcf = new VariantContextBuilder(bcfRaw).source("variant").make();
|
||||
if ( ! quiet ) {
|
||||
System.out.printf("bcf = %s %d %s%n", bcf.getChr(), bcf.getStart(), bcf.toString());
|
||||
System.out.printf("--------------------------------------------------%n");
|
||||
}
|
||||
}
|
||||
|
||||
} catch ( IOException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(bcfFile, "bad user!");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -24,20 +24,22 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.EnumSet;
|
||||
|
||||
/**
|
||||
* BCF2 types and information
|
||||
* BCF2 types and associated information
|
||||
*
|
||||
* @author depristo
|
||||
* @since 05/12
|
||||
*/
|
||||
public enum BCF2Type {
|
||||
INT8(1, 1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range
|
||||
INT16(2, 2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767),
|
||||
INT32(3, 4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647),
|
||||
FLOAT(5, 4, BCF2Utils.FLOAT_MISSING_VALUE),
|
||||
CHAR(7);
|
||||
INT8 (1, 1, 0xFFFFFF80, -127, 127), // todo -- confirm range
|
||||
INT16(2, 2, 0xFFFF8000, -32767, 32767),
|
||||
INT32(3, 4, 0x80000000, -2147483647, 2147483647),
|
||||
FLOAT(5, 4, 0x7F800001),
|
||||
CHAR (7);
|
||||
|
||||
private final int id;
|
||||
private final Object missingJavaValue;
|
||||
|
|
@ -62,13 +64,49 @@ public enum BCF2Type {
|
|||
this.maxValue = maxValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* How many bytes are used to represent this type on disk?
|
||||
* @return
|
||||
*/
|
||||
public int getSizeInBytes() {
|
||||
return sizeInBytes;
|
||||
}
|
||||
|
||||
/**
|
||||
* The ID according to the BCF2 specification
|
||||
* @return
|
||||
*/
|
||||
public int getID() { return id; }
|
||||
|
||||
/**
|
||||
* Can we encode value v in this type, according to its declared range.
|
||||
*
|
||||
* Only makes sense for integer values
|
||||
*
|
||||
* @param v
|
||||
* @return
|
||||
*/
|
||||
@Requires("INTEGERS.contains(this)")
|
||||
public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }
|
||||
|
||||
/**
|
||||
* Return the java object (aka null) that is used to represent a missing value for this
|
||||
* type in Java
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Object getMissingJavaValue() { return missingJavaValue; }
|
||||
|
||||
/**
|
||||
* The bytes (encoded as an int) that are used to represent a missing value
|
||||
* for this type in BCF2
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public int getMissingBytes() { return missingBytes; }
|
||||
|
||||
/**
|
||||
* An enum set of the types that might represent Integer values
|
||||
*/
|
||||
public final static EnumSet<BCF2Type> INTEGERS = EnumSet.of(INT8, INT16, INT32);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -53,12 +53,6 @@ public final class BCF2Utils {
|
|||
public static final int OVERFLOW_ELEMENT_MARKER = 15;
|
||||
public static final int MAX_INLINE_ELEMENTS = 14;
|
||||
|
||||
// Note that these values are prefixed by FFFFFF for convenience
|
||||
public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
|
||||
public static final int INT16_MISSING_VALUE = 0xFFFF8000;
|
||||
public static final int INT32_MISSING_VALUE = 0x80000000;
|
||||
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
|
||||
|
||||
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32};
|
||||
public final static BCF2Type[] ID_TO_ENUM;
|
||||
|
||||
|
|
|
|||
|
|
@ -249,11 +249,12 @@ public class VCFCodec extends AbstractVCFCodec {
|
|||
return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset());
|
||||
}
|
||||
|
||||
private final static String[] INT_DECODE_ARRAY = new String[10000];
|
||||
private final static int[] decodeInts(final String string) {
|
||||
final String[] splits = string.split(",");
|
||||
final int[] values = new int[splits.length];
|
||||
for ( int i = 0; i < splits.length; i++ )
|
||||
values[i] = Integer.valueOf(splits[i]);
|
||||
final int nValues = ParsingUtils.split(string, INT_DECODE_ARRAY, ',');
|
||||
final int[] values = new int[nValues];
|
||||
for ( int i = 0; i < nValues; i++ )
|
||||
values[i] = Integer.valueOf(INT_DECODE_ARRAY[i]);
|
||||
return values;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -34,10 +34,23 @@ import java.util.*;
|
|||
/**
|
||||
* A builder class for genotypes
|
||||
*
|
||||
* Provides convenience setter methods for all of the Genotype field
|
||||
* values. Setter methods can be used in any order, allowing you to
|
||||
* pass through states that wouldn't be allowed in the highly regulated
|
||||
* immutable Genotype class.
|
||||
*
|
||||
* All fields default to meaningful MISSING values.
|
||||
*
|
||||
* Call make() to actually create the corresponding Genotype object from
|
||||
* this builder. Can be called multiple times to create independent copies,
|
||||
* or with intervening sets to conveniently make similar Genotypes with
|
||||
* slight modifications.
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 06/12
|
||||
*/
|
||||
public final class GenotypeBuilder {
|
||||
public static boolean MAKE_FAST_BY_DEFAULT = false;
|
||||
public static boolean MAKE_FAST_BY_DEFAULT = true;
|
||||
|
||||
private String sampleName = null;
|
||||
private List<Allele> alleles = null;
|
||||
|
|
@ -48,6 +61,7 @@ public final class GenotypeBuilder {
|
|||
private int[] AD = null;
|
||||
private int[] PL = null;
|
||||
private Map<String, Object> extendedAttributes = null;
|
||||
private int initialAttributeMapSize = 5;
|
||||
|
||||
private boolean useFast = MAKE_FAST_BY_DEFAULT;
|
||||
|
||||
|
|
@ -191,6 +205,11 @@ public final class GenotypeBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set this genotype's name
|
||||
* @param sampleName
|
||||
* @return
|
||||
*/
|
||||
@Requires({"sampleName != null"})
|
||||
@Ensures({"this.sampleName != null"})
|
||||
public GenotypeBuilder name(final String sampleName) {
|
||||
|
|
@ -198,6 +217,11 @@ public final class GenotypeBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set this genotype's alleles
|
||||
* @param alleles
|
||||
* @return
|
||||
*/
|
||||
@Ensures({"this.alleles != null"})
|
||||
public GenotypeBuilder alleles(final List<Allele> alleles) {
|
||||
if ( alleles == null )
|
||||
|
|
@ -207,6 +231,11 @@ public final class GenotypeBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this genotype phased?
|
||||
* @param phased
|
||||
* @return
|
||||
*/
|
||||
public GenotypeBuilder phased(final boolean phased) {
|
||||
isPhased = phased;
|
||||
return this;
|
||||
|
|
@ -235,11 +264,34 @@ public final class GenotypeBuilder {
|
|||
return GQ((int)Math.round(pLog10Error * -10));
|
||||
}
|
||||
|
||||
/**
|
||||
* This genotype has no GQ value
|
||||
* @return
|
||||
*/
|
||||
public GenotypeBuilder noGQ() { GQ = -1; return this; }
|
||||
|
||||
/**
|
||||
* This genotype has no AD value
|
||||
* @return
|
||||
*/
|
||||
public GenotypeBuilder noAD() { AD = null; return this; }
|
||||
|
||||
/**
|
||||
* This genotype has no DP value
|
||||
* @return
|
||||
*/
|
||||
public GenotypeBuilder noDP() { DP = -1; return this; }
|
||||
|
||||
/**
|
||||
* This genotype has no PL value
|
||||
* @return
|
||||
*/
|
||||
public GenotypeBuilder noPL() { PL = null; return this; }
|
||||
|
||||
/**
|
||||
* This genotype has this DP value
|
||||
* @return
|
||||
*/
|
||||
@Requires({"DP >= -1"})
|
||||
@Ensures({"this.DP == DP"})
|
||||
public GenotypeBuilder DP(final int DP) {
|
||||
|
|
@ -247,6 +299,10 @@ public final class GenotypeBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* This genotype has this AD value
|
||||
* @return
|
||||
*/
|
||||
@Requires({"AD == null || AD.length > 0"})
|
||||
@Ensures({"this.AD == AD"})
|
||||
public GenotypeBuilder AD(final int[] AD) {
|
||||
|
|
@ -254,6 +310,10 @@ public final class GenotypeBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* This genotype has this PL value, as int[]. FAST
|
||||
* @return
|
||||
*/
|
||||
@Requires("PL == null || PL.length > 0")
|
||||
@Ensures({"this.PL == PL"})
|
||||
public GenotypeBuilder PL(final int[] PL) {
|
||||
|
|
@ -261,6 +321,10 @@ public final class GenotypeBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* This genotype has this PL value, converted from double[]. SLOW
|
||||
* @return
|
||||
*/
|
||||
@Requires("PL == null || PL.length > 0")
|
||||
@Ensures({"this.PL == PL"})
|
||||
public GenotypeBuilder PL(final double[] GLs) {
|
||||
|
|
@ -268,6 +332,12 @@ public final class GenotypeBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* This genotype has these attributes.
|
||||
*
|
||||
* Cannot contain inline attributes (DP, AD, GQ, PL)
|
||||
* @return
|
||||
*/
|
||||
@Requires("attributes != null")
|
||||
@Ensures("attributes.isEmpty() || extendedAttributes != null")
|
||||
public GenotypeBuilder attributes(final Map<String, Object> attributes) {
|
||||
|
|
@ -276,11 +346,17 @@ public final class GenotypeBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* This genotype has this attribute key / value pair.
|
||||
*
|
||||
* Cannot contain inline attributes (DP, AD, GQ, PL)
|
||||
* @return
|
||||
*/
|
||||
@Requires({"key != null"})
|
||||
@Ensures({"extendedAttributes != null", "extendedAttributes.containsKey(key)"})
|
||||
public GenotypeBuilder attribute(final String key, final Object value) {
|
||||
if ( extendedAttributes == null )
|
||||
extendedAttributes = new HashMap<String, Object>(5);
|
||||
extendedAttributes = new HashMap<String, Object>(initialAttributeMapSize);
|
||||
extendedAttributes.put(key, value);
|
||||
return this;
|
||||
}
|
||||
|
|
@ -299,8 +375,34 @@ public final class GenotypeBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* varargs version of #filters
|
||||
* @param filters
|
||||
* @return
|
||||
*/
|
||||
@Requires("filters != null")
|
||||
public GenotypeBuilder filters(final String ... filters) {
|
||||
return filters(Arrays.asList(filters));
|
||||
}
|
||||
|
||||
/**
|
||||
* This genotype is unfiltered
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Requires("filters != null")
|
||||
public GenotypeBuilder unfiltered() {
|
||||
if ( extendedAttributes != null )
|
||||
extendedAttributes.remove(VCFConstants.GENOTYPE_FILTER_KEY);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tell's this builder that we have at most these number of attributes
|
||||
* @return
|
||||
*/
|
||||
public GenotypeBuilder maxAttributes(final int i) {
|
||||
// TODO
|
||||
initialAttributeMapSize = i;
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
@ -48,6 +48,7 @@ public class GenotypeLikelihoods {
|
|||
return new GenotypeLikelihoods(PLs);
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public final static GenotypeLikelihoods fromGLField(String GLs) {
|
||||
return new GenotypeLikelihoods(parseDeprecatedGLString(GLs));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,8 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.variantcontext.writer;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec;
|
||||
|
|
@ -39,7 +41,6 @@ import java.io.*;
|
|||
import java.util.*;
|
||||
|
||||
class BCF2Writer extends IndexingVariantContextWriter {
|
||||
private final static boolean FULLY_DECODE = true;
|
||||
final protected static Logger logger = Logger.getLogger(BCF2Writer.class);
|
||||
|
||||
private final OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support
|
||||
|
|
@ -63,12 +64,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private final void createContigDictionary(final Collection<VCFContigHeaderLine> contigLines) {
|
||||
int offset = 0;
|
||||
for ( VCFContigHeaderLine contig : contigLines )
|
||||
contigDictionary.put(contig.getID(), offset++);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeHeader(final VCFHeader header) {
|
||||
// create the config offsets map
|
||||
|
|
@ -103,8 +98,11 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void add( final VariantContext initialVC ) {
|
||||
final VariantContext vc = FULLY_DECODE ? initialVC.fullyDecode(header) : initialVC;
|
||||
public void add( VariantContext vc ) {
|
||||
if ( doNotWriteGenotypes )
|
||||
vc = new VariantContextBuilder(vc).noGenotypes().make();
|
||||
vc = vc.fullyDecode(header);
|
||||
|
||||
super.add(vc); // allow on the fly indexing
|
||||
|
||||
try {
|
||||
|
|
@ -253,9 +251,13 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO TODO TODO TODO TODO
|
||||
// TODO
|
||||
// TODO -- we really need explicit converters as first class objects
|
||||
// TODO -- need to generalize so we can enable vectors of compressed genotype ints
|
||||
// TODO -- no sense in allocating these over and over
|
||||
// TODO
|
||||
// TODO TODO TODO TODO TODO
|
||||
private final VCFToBCFEncoding prepFieldValueForEncoding(final String field, final Object value) {
|
||||
final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, field);
|
||||
final boolean isList = value instanceof List;
|
||||
|
|
@ -269,8 +271,8 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
case Flag:
|
||||
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1));
|
||||
case String:
|
||||
final List<String> s = isList ? (List<String>)value : Collections.singletonList((String) value);
|
||||
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, s);
|
||||
final String s = isList ? BCF2Utils.collapseStringList((List<String>)value) : (String)value;
|
||||
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, Collections.singletonList(s));
|
||||
case Integer: // note integer calculation is a bit complex because of the need to determine sizes
|
||||
List<Integer> l;
|
||||
BCF2Type intType;
|
||||
|
|
@ -297,19 +299,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
}
|
||||
}
|
||||
|
||||
private final void addGenotypeFilters(final VariantContext vc) throws IOException {
|
||||
logger.warn("Skipping genotype filter field");
|
||||
// // TODO -- FIXME -- string is wrong here -- need to compute string size...
|
||||
// startGenotypeField(VCFConstants.GENOTYPE_FILTER_KEY, 1, BCFType.CHAR);
|
||||
// for ( final Genotype g : vc.getGenotypes() ) {
|
||||
// if ( g.filtersWereApplied() && g.isFiltered() ) {
|
||||
// encoder.encodeString(ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters())));
|
||||
// } else {
|
||||
// encoder.encodeRawMissingValues(1, BCFType.CHAR); // todo fixme
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Genotype field encoding
|
||||
|
|
@ -319,10 +308,8 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
private byte[] buildSamplesData(final VariantContext vc) throws IOException {
|
||||
BCF2Codec.LazyData lazyData = getLazyData(vc);
|
||||
if ( lazyData != null ) {
|
||||
logger.info("Lazy decoded data detected, writing bytes directly to stream. N = " + lazyData.bytes.length);
|
||||
return lazyData.bytes;
|
||||
} else {
|
||||
final GenotypesContext gc = vc.getGenotypes();
|
||||
// we have to do work to convert the VC into a BCF2 byte stream
|
||||
List<String> genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header);
|
||||
for ( final String field : genotypeFields ) {
|
||||
|
|
@ -330,8 +317,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
addGenotypes(vc);
|
||||
} else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) {
|
||||
addIntGenotypeField(field, intGenotypeFieldAccessors.getAccessor(field), vc);
|
||||
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
|
||||
addGenotypeFilters(vc);
|
||||
} else {
|
||||
addGenericGenotypeField(vc, field);
|
||||
}
|
||||
|
|
@ -388,6 +373,9 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
if ( numInFormatField == 1 ) {
|
||||
// we encode the actual allele, encodeRawValue handles the missing case where fieldValue == null
|
||||
encoder.encodeRawValue(fieldValue, encoding.BCF2Type);
|
||||
} else if ( encoding.vcfType == VCFHeaderLineType.String ) {
|
||||
// we have multiple strings, so we need to collapse them
|
||||
throw new ReviewedStingException("LIMITATION: Cannot encode arrays of string values in genotypes");
|
||||
} else {
|
||||
// multiple values, need to handle general case
|
||||
final List<Object> asList = toList(fieldValue);
|
||||
|
|
@ -402,6 +390,12 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO TODO TODO TODO TODO
|
||||
// TODO
|
||||
// TODO THIS ROUTINE NEEDS TO BE OPTIMIZED. IT ACCOUNTS FOR A SIGNIFICANT AMOUNT OF THE
|
||||
// TODO RUNTIME FOR WRITING OUT BCF FILES WITH MANY GENOTYPES
|
||||
// TODO
|
||||
// TODO TODO TODO TODO TODO
|
||||
private final void addIntGenotypeField(final String field,
|
||||
final IntGenotypeFieldAccessors.Accessor ige,
|
||||
final VariantContext vc) throws IOException {
|
||||
|
|
@ -429,13 +423,19 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
encoder.encodePrimitive(pl == -1 ? type.getMissingBytes() : pl, type);
|
||||
}
|
||||
|
||||
// TODO TODO TODO TODO TODO
|
||||
// TODO
|
||||
// TODO we should really have a fast path for encoding diploid genotypes where
|
||||
// TODO we don't pay the overhead of creating the allele maps
|
||||
// TODO
|
||||
// TODO TODO TODO TODO TODO
|
||||
private final void addGenotypes(final VariantContext vc) throws IOException {
|
||||
if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES )
|
||||
throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " +
|
||||
"with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have "
|
||||
+ vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart());
|
||||
|
||||
final Map<Allele, String> alleleMap = VCFWriter.buildAlleleMap(vc);
|
||||
final Map<Allele, Integer> alleleMap = buildAlleleMap(vc);
|
||||
final int maxPloidy = vc.getMaxPloidy();
|
||||
startGenotypeField(VCFConstants.GENOTYPE_KEY, maxPloidy, BCF2Type.INT8);
|
||||
for ( final String name : header.getGenotypeSamples() ) {
|
||||
|
|
@ -446,7 +446,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
if ( i < samplePloidy ) {
|
||||
// we encode the actual allele
|
||||
final Allele a = alleles.get(i);
|
||||
final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a));
|
||||
final int offset = alleleMap.get(a);
|
||||
final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00);
|
||||
encoder.encodePrimitive(encoded, BCF2Type.INT8);
|
||||
} else {
|
||||
|
|
@ -457,6 +457,18 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
}
|
||||
}
|
||||
|
||||
private final static Map<Allele, Integer> buildAlleleMap(final VariantContext vc) {
|
||||
final Map<Allele, Integer> alleleMap = new HashMap<Allele, Integer>(vc.getAlleles().size()+1);
|
||||
alleleMap.put(Allele.NO_CALL, -1); // convenience for lookup
|
||||
|
||||
final List<Allele> alleles = vc.getAlleles();
|
||||
for ( int i = 0; i < alleles.size(); i++ ) {
|
||||
alleleMap.put(alleles.get(i), i);
|
||||
}
|
||||
|
||||
return alleleMap;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Low-level block encoding
|
||||
|
|
@ -470,24 +482,26 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
@Requires({"infoBlock.length > 0", "genotypesBlock.length >= 0"})
|
||||
private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException {
|
||||
assert infoBlock.length > 0;
|
||||
assert genotypesBlock.length >= 0;
|
||||
|
||||
BCF2Encoder.encodePrimitive(infoBlock.length, BCF2Type.INT32, outputStream);
|
||||
BCF2Encoder.encodePrimitive(genotypesBlock.length, BCF2Type.INT32, outputStream);
|
||||
outputStream.write(infoBlock);
|
||||
outputStream.write(genotypesBlock);
|
||||
}
|
||||
|
||||
// TODO -- obvious optimization case
|
||||
@Requires("string != null")
|
||||
@Ensures("BCF2Type.INTEGERS.contains(result)")
|
||||
private final BCF2Type encodeStringByRef(final String string) throws IOException {
|
||||
assert string != null;
|
||||
|
||||
return encodeStringsByRef(Collections.singletonList(string));
|
||||
final Integer offset = stringDictionaryMap.get(string);
|
||||
if ( offset == null ) throw new ReviewedStingException("Format error: could not find string " + string + " in header as required by BCF");
|
||||
final BCF2Type type = encoder.determineIntegerType(offset);
|
||||
encoder.encodeTyped(offset, type);
|
||||
return type;
|
||||
}
|
||||
|
||||
// TODO -- in size == 1 case branch to singleoton fast-path
|
||||
@Requires("! strings.isEmpty()")
|
||||
@Ensures("BCF2Type.INTEGERS.contains(result)")
|
||||
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
|
||||
assert ! strings.isEmpty();
|
||||
|
||||
|
|
@ -517,17 +531,38 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
return maxType;
|
||||
}
|
||||
|
||||
@Requires({"key != null", "! key.equals(\"\")", "size >= 0"})
|
||||
private final void startGenotypeField(final String key, final int size, final BCF2Type valueType) throws IOException {
|
||||
assert key != null && ! key.equals("");
|
||||
assert size >= 0;
|
||||
|
||||
encodeStringByRef(key);
|
||||
encoder.encodeType(size, valueType);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function that takes an object and returns a list representation
|
||||
* of it:
|
||||
*
|
||||
* o == null => []
|
||||
* o is a list => o
|
||||
* else => [o]
|
||||
*
|
||||
* @param o
|
||||
* @return
|
||||
*/
|
||||
private final static List<Object> toList(final Object o) {
|
||||
if ( o == null ) return Collections.emptyList();
|
||||
else if ( o instanceof List ) return (List<Object>)o;
|
||||
else return Collections.singletonList(o);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create the contigDictionary from the contigLines extracted from the VCF header
|
||||
*
|
||||
* @param contigLines
|
||||
*/
|
||||
@Requires("contigDictionary.isEmpty()")
|
||||
private final void createContigDictionary(final Collection<VCFContigHeaderLine> contigLines) {
|
||||
int offset = 0;
|
||||
for ( VCFContigHeaderLine contig : contigLines )
|
||||
contigDictionary.put(contig.getID(), offset++);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -251,7 +251,7 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
}
|
||||
}
|
||||
|
||||
public static Map<Allele, String> buildAlleleMap(final VariantContext vc) {
|
||||
private static Map<Allele, String> buildAlleleMap(final VariantContext vc) {
|
||||
final Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size()+1);
|
||||
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup
|
||||
|
||||
|
|
|
|||
|
|
@ -161,9 +161,13 @@ public class VariantContextTestProvider {
|
|||
metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x"));
|
||||
|
||||
metaData.add(new VCFInfoHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
|
||||
metaData.add(new VCFInfoHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality"));
|
||||
metaData.add(new VCFInfoHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
|
||||
metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
|
||||
metaData.add(new VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality"));
|
||||
metaData.add(new VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
|
||||
metaData.add(new VCFFormatHeaderLine("GS", 2, VCFHeaderLineType.String, "A doubleton list of strings"));
|
||||
metaData.add(new VCFFormatHeaderLine("GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "A variable list of strings"));
|
||||
metaData.add(new VCFFormatHeaderLine("FT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype filters"));
|
||||
|
||||
// prep the header
|
||||
metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0));
|
||||
|
||||
|
|
@ -386,7 +390,53 @@ public class VariantContextTestProvider {
|
|||
attr("g1", ref, "FLOAT3", 1.0, 2.0, 3.0),
|
||||
attr("g2", ref, "FLOAT3"));
|
||||
|
||||
// test test Integer, Float, Flag, String atomic, vector, and missing types of different lengths per sample
|
||||
//
|
||||
//
|
||||
// TESTING MULTIPLE SIZED LISTS IN THE GENOTYPE FIELD
|
||||
//
|
||||
//
|
||||
addGenotypeTests(site,
|
||||
attr("g1", ref, "GS", Arrays.asList("S1", "S2")),
|
||||
attr("g2", ref, "GS", Arrays.asList("S3", "S4")));
|
||||
|
||||
addGenotypeTests(site, // g1 is missing the string, and g2 is missing FLOAT1
|
||||
attr("g1", ref, "FLOAT1", 1.0),
|
||||
attr("g2", ref, "GS", Arrays.asList("S3", "S4")));
|
||||
|
||||
// variable sized lists
|
||||
addGenotypeTests(site,
|
||||
attr("g1", ref, "GV", Arrays.asList("S1")),
|
||||
attr("g2", ref, "GV", Arrays.asList("S3", "S4")));
|
||||
|
||||
addGenotypeTests(site,
|
||||
attr("g1", ref, "GV", Arrays.asList("S1", "S2")),
|
||||
attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5")));
|
||||
|
||||
addGenotypeTests(site, // missing value in varlist of string
|
||||
attr("g1", ref, "FLOAT1", 1.0),
|
||||
attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5")));
|
||||
|
||||
|
||||
//
|
||||
//
|
||||
// TESTING GENOTYPE FILTERS
|
||||
//
|
||||
//
|
||||
addGenotypeTests(site,
|
||||
new GenotypeBuilder("g1", Arrays.asList(ref, ref)).filters("X").make(),
|
||||
new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make());
|
||||
addGenotypeTests(site,
|
||||
new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(),
|
||||
new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make());
|
||||
addGenotypeTests(site,
|
||||
new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(),
|
||||
new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X", "Y").make());
|
||||
addGenotypeTests(site,
|
||||
new GenotypeBuilder("g1", Arrays.asList(ref, ref)).unfiltered().make(),
|
||||
new GenotypeBuilder("g2", Arrays.asList(ref, ref)).filters("X").make(),
|
||||
new GenotypeBuilder("g3", Arrays.asList(ref, ref)).filters("X", "Y").make());
|
||||
|
||||
// TODO -- test test Integer, Float, Flag, String atomic, vector, and missing types of different lengths per sample
|
||||
}
|
||||
|
||||
private static Genotype attr(final String name, final Allele ref, final String key, final Object ... value) {
|
||||
|
|
@ -493,7 +543,27 @@ public class VariantContextTestProvider {
|
|||
Assert.assertEquals(actual.getSampleName(), expected.getSampleName());
|
||||
Assert.assertEquals(actual.getAlleles(), expected.getAlleles());
|
||||
Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString());
|
||||
Assert.assertEquals(actual.getType(), expected.getType());
|
||||
|
||||
// filters are the same
|
||||
Assert.assertEquals(actual.getFilters(), expected.getFilters());
|
||||
Assert.assertEquals(actual.isFiltered(), expected.isFiltered());
|
||||
Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied());
|
||||
|
||||
// inline attributes
|
||||
Assert.assertEquals(actual.getDP(), expected.getDP());
|
||||
Assert.assertEquals(actual.getAD(), expected.getAD());
|
||||
Assert.assertEquals(actual.getGQ(), expected.getGQ());
|
||||
Assert.assertEquals(actual.hasPL(), expected.hasPL());
|
||||
Assert.assertEquals(actual.hasAD(), expected.hasAD());
|
||||
Assert.assertEquals(actual.hasGQ(), expected.hasGQ());
|
||||
Assert.assertEquals(actual.hasDP(), expected.hasDP());
|
||||
|
||||
Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods());
|
||||
Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString());
|
||||
Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods());
|
||||
Assert.assertEquals(actual.getPL(), expected.getPL());
|
||||
|
||||
Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual());
|
||||
assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes());
|
||||
Assert.assertEquals(actual.isPhased(), expected.isPhased());
|
||||
|
|
|
|||
Loading…
Reference in New Issue