GenotypeLikelihood PLs are capped at Short.MAX_INT now

-- UserExceptions in BCF2 now where appropriate
-- Asserts for code safety
-- Public -> protected encode(Object v) method is for testing only
This commit is contained in:
Mark DePristo 2012-05-18 09:04:59 -04:00
parent d52bc31a47
commit 6301572009
5 changed files with 175 additions and 128 deletions

View File

@ -30,6 +30,7 @@ import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureCodecHeader;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -96,7 +97,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
try {
// note that this reads the magic as well, and so does double duty
if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) )
throw new IllegalArgumentException("Input stream does not begin with BCF2 magic");
throw new UserException.MalformedBCF2("Input stream does not begin with BCF2 magic");
final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream);
@ -224,15 +225,13 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
private void decodeID( final VariantContextBuilder builder ) {
final String id = (String)decoder.decodeTypedValue();
if ( id == null ) {
if ( id == null )
builder.noID();
}
else {
else
builder.id(id);
}
}
public static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) {
protected static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) {
if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) {
ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size());
AbstractVCFCodec.clipAlleles(position, ref, unclippedAlleles, clippedAlleles, -1);
@ -298,15 +297,16 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
final List<String> samples = new ArrayList<String>(header.getGenotypeSamples());
final int nSamples = siteInfo.nSamples;
final int nFields = siteInfo.nFormatFields;
final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(nFields, nSamples);
if ( samples.size() != nSamples )
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
"different numbers of samples per record. Saw " + samples.size() +
" samples in header but have a record with " + nSamples + " samples");
final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(nFields, nSamples);
final List<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
for ( int i = 0; i < nSamples; i++ ) {
// all of the information we need for each genotype, with default values
final String sampleName = samples.get(i);
List<Allele> alleles = null;
boolean isPhased = false;
@ -318,31 +318,37 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
for ( final Map.Entry<String, List<Object>> entry : fieldValues.entrySet() ) {
final String field = entry.getKey();
final List<Object> values = entry.getValue();
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
alleles = decodeGenotypeAlleles(siteInfo.alleles, (List<Integer>)values.get(i));
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
final Integer value = (Integer)values.get(i);
if ( value != BCF2Type.INT8.getMissingJavaValue() )
log10PError = value / -10.0;
} else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
final List<Integer> pls = (List<Integer>)values.get(i);
if ( pls != null ) { // we have a PL field
log10Likelihoods = new double[pls.size()];
for ( int j = 0; j < log10Likelihoods.length; j++ )
log10Likelihoods[j] = pls.get(j) / -10.0;
}
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
//filters = new HashSet<String>(values.get(i));
} else { // add to attributes
if ( values.get(i) != null ) { // don't add missing values
if ( attributes == null ) attributes = new HashMap<String, Object>(nFields);
attributes.put(field, values.get(i));
try {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
alleles = decodeGenotypeAlleles(siteInfo.alleles, (List<Integer>)values.get(i));
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
final Integer value = (Integer)values.get(i);
if ( value != BCF2Type.INT8.getMissingJavaValue() )
log10PError = value / -10.0;
} else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
final List<Integer> pls = (List<Integer>)values.get(i);
if ( pls != null ) { // we have a PL field
log10Likelihoods = new double[pls.size()];
for ( int j = 0; j < log10Likelihoods.length; j++ )
log10Likelihoods[j] = pls.get(j) / -10.0;
}
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
//filters = new HashSet<String>(values.get(i));
} else { // add to attributes
if ( values.get(i) != null ) { // don't add missing values
if ( attributes == null ) attributes = new HashMap<String, Object>(nFields);
attributes.put(field, values.get(i));
}
}
} catch ( ClassCastException e ) {
throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
+ " inconsistent with the value observed in the decoded value in the "
+ " BCF file. Value was " + Utils.join(",", values));
}
}
if ( alleles == null ) throw new ReviewedStingException("BUG: no alleles found");
if ( alleles == null ) throw new UserException.MalformedBCF2("BUG: no alleles found");
final Genotype g = new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods);
genotypes.add(g);
@ -368,18 +374,24 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
}
private final Map<String, List<Object>> decodeGenotypeFieldValues(final int nFields, final int nSamples) {
final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields);
assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0);
for ( int i = 0; i < nFields; i++ ) {
final String field = getDictionaryString();
final byte typeDescriptor = decoder.readTypeDescriptor();
final List<Object> values = new ArrayList<Object>(nSamples);
for ( int j = 0; j < nSamples; j++ )
values.add(decoder.decodeTypedValue(typeDescriptor));
map.put(field, values);
if ( nFields == 0 ) // fast path exit for sites only file
return Collections.emptyMap();
else {
final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields);
for ( int i = 0; i < nFields; i++ ) {
final String field = getDictionaryString();
final byte typeDescriptor = decoder.readTypeDescriptor();
final List<Object> values = new ArrayList<Object>(nSamples);
for ( int j = 0; j < nSamples; j++ )
values.add(decoder.decodeTypedValue(typeDescriptor));
map.put(field, values);
}
return map;
}
return map;
}
private final String getDictionaryString() {
@ -387,6 +399,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
}
private final String getDictionaryString(final int offset) {
if ( offset >= dictionary.size() ) throw new UserException.MalformedBCF2("BUG: no dictionary field found at offset " + offset);
final String field = dictionary.get(offset);
return field;
}

View File

@ -113,6 +113,8 @@ public class BCF2Decoder {
* @param recordBytes
*/
public void setRecordBytes(final byte[] recordBytes) {
assert recordBytes != null;
this.recordBytes = recordBytes;
this.recordStream = new ByteArrayInputStream(recordBytes);
}
@ -145,7 +147,7 @@ public class BCF2Decoder {
for ( int i = 0; i < size; i++ ) {
ints.add(decodeSingleValue(type));
}
return ints.get(0) == null ? null : ints;
return ints.get(0) == null ? null : ints; // return null when all of the values are null
}
}
@ -175,6 +177,7 @@ public class BCF2Decoder {
private final Object decodeLiteralString(final int size) {
assert size > 0;
// TODO -- assumes size > 0
final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array
try {
@ -227,6 +230,8 @@ public class BCF2Decoder {
* @return
*/
private final static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) {
assert blockSizeInBytes >= 0;
final byte[] record = new byte[blockSizeInBytes];
try {
final int bytesRead = inputStream.read(record);
@ -239,6 +244,8 @@ public class BCF2Decoder {
}
private final static void validateReadBytes(final int actuallyRead, final int expected) {
assert expected >= 0;
if ( actuallyRead < expected ) {
throw new UserException.MalformedBCF2(String.format("Failed to read next complete record: %s",
actuallyRead == -1 ?

View File

@ -58,31 +58,6 @@ public class BCF2Encoder {
return bytes;
}
// --------------------------------------------------------------------------------
//
// Super-high level interface
//
// --------------------------------------------------------------------------------
/**
* Totally generic encoder that examines o, determines the best way to encode it, and encodes it
* @param o
* @return
*/
public final BCF2Type encode(final Object o) throws IOException {
if ( o == null ) throw new ReviewedStingException("Generic encode cannot deal with null values");
if ( o instanceof List ) {
final BCF2Type type = determineBCFType(((List) o).get(0));
encodeTyped((List) o, type);
return type;
} else {
final BCF2Type type = determineBCFType(o);
encodeTyped(o, type);
return type;
}
}
// --------------------------------------------------------------------------------
//
// Writing typed values (have type byte)
@ -108,12 +83,6 @@ public class BCF2Encoder {
encodeRawValues(v, type);
}
public final BCF2Type encodeTypedIntOfBestSize(final int value) throws IOException {
final BCF2Type type = determineIntegerType(value);
encodeTyped(value, type);
return type;
}
// --------------------------------------------------------------------------------
//
// Writing raw values (don't have a type byte)
@ -127,17 +96,21 @@ public class BCF2Encoder {
}
public final <T extends Object> void encodeRawValue(final T value, final BCF2Type type) throws IOException {
if ( value == type.getMissingJavaValue() )
encodeRawMissingValue(type);
else {
switch (type) {
case INT8:
case INT16:
case INT32: encodePrimitive((Integer)value, type); break;
case FLOAT: encodeRawFloat((Double) value, type); break;
case CHAR: encodeRawChar((Byte) value); break;
default: throw new ReviewedStingException("Illegal type encountered " + type);
try {
if ( value == type.getMissingJavaValue() )
encodeRawMissingValue(type);
else {
switch (type) {
case INT8:
case INT16:
case INT32: encodePrimitive((Integer)value, type); break;
case FLOAT: encodeRawFloat((Double) value); break;
case CHAR: encodeRawChar((Byte) value); break;
default: throw new ReviewedStingException("Illegal type encountered " + type);
}
}
} catch ( ClassCastException e ) {
throw new ReviewedStingException("BUG: invalid type cast to " + type + " from " + value);
}
}
@ -146,6 +119,8 @@ public class BCF2Encoder {
}
public final void encodeRawMissingValues(final int size, final BCF2Type type) throws IOException {
if ( size <= 0 ) throw new ReviewedStingException("BUG: size <= 0");
for ( int i = 0; i < size; i++ )
encodeRawMissingValue(type);
}
@ -160,15 +135,19 @@ public class BCF2Encoder {
encodeStream.write(c);
}
public final void encodeRawFloat(final double value, final BCF2Type type) throws IOException {
encodePrimitive(Float.floatToIntBits((float)value), type);
public final void encodeRawFloat(final double value) throws IOException {
encodePrimitive(Float.floatToIntBits((float)value), BCF2Type.FLOAT);
}
public final void encodeType(final int size, final BCF2Type type) throws IOException {
if ( size < 0 ) throw new ReviewedStingException("BUG: size < 0");
final byte typeByte = BCF2Utils.encodeTypeDescriptor(size, type);
encodeStream.write(typeByte);
if ( BCF2Utils.willOverflow(size) )
encodeTypedIntOfBestSize(size);
if ( BCF2Utils.willOverflow(size) ) {
// write in the overflow size
encodeTyped(size, determineIntegerType(size));
}
}
public final void encodeRawInt(final int value, final BCF2Type type) throws IOException {
@ -223,6 +202,28 @@ public class BCF2Encoder {
throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
}
/**
* Totally generic encoder that examines o, determines the best way to encode it, and encodes it
*
* This method is incredibly slow, but it's only used for UnitTests so it doesn't matter
*
* @param o
* @return
*/
protected final BCF2Type encode(final Object o) throws IOException {
if ( o == null ) throw new ReviewedStingException("Generic encode cannot deal with null values");
if ( o instanceof List ) {
final BCF2Type type = determineBCFType(((List) o).get(0));
encodeTyped((List) o, type);
return type;
} else {
final BCF2Type type = determineBCFType(o);
encodeTyped(o, type);
return type;
}
}
private final BCF2Type determineBCFType(final Object arg) {
final Object toType = arg instanceof List ? ((List)arg).get(0) : arg;
@ -246,6 +247,8 @@ public class BCF2Encoder {
}
private final List<Byte> stringToBytes(final String v) throws IOException {
assert v != null && !v.equals("");
// TODO -- this needs to be optimized away for efficiency
final byte[] bytes = v.getBytes();
final List<Byte> l = new ArrayList<Byte>(bytes.length);

View File

@ -33,6 +33,8 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.EnumMap;
public class GenotypeLikelihoods {
public final static int MAX_PL = Short.MAX_VALUE;
//
// There are two objects here because we are lazy in creating both representations
// for this object: a vector of log10 Probs and the PL phred-scaled string. Supports
@ -209,7 +211,7 @@ public class GenotypeLikelihoods {
final double adjust = maxPL(GLs);
for ( int i = 0; i < GLs.length; i++ ) {
pls[i] = (int)Math.round(-10 * (GLs[i] - adjust));
pls[i] = (int)Math.round(Math.min(-10 * (GLs[i] - adjust), MAX_PL));
}
return pls;

View File

@ -146,7 +146,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
// qual
if ( vc.hasLog10PError() )
encoder.encodeRawFloat((float) vc.getPhredScaledQual(), BCF2Type.FLOAT);
encoder.encodeRawFloat((float) vc.getPhredScaledQual());
else
encoder.encodeRawMissingValue(BCF2Type.FLOAT);
@ -183,7 +183,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
if ( vc.isFiltered() ) {
encodeStringsByRef(vc.getFilters());
} else {
encoder.encodeTypedMissing(BCF2Type.INT32);
encoder.encodeTypedMissing(BCF2Type.INT8);
}
}
@ -198,7 +198,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
}
private byte[] buildSamplesData(final VariantContext vc) throws IOException {
// write size
List<String> genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc);
for ( final String field : genotypeFields ) {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
@ -219,6 +218,8 @@ class BCF2Writer extends IndexingVariantContextWriter {
private final int getNGenotypeFieldValues(final String field, final VariantContext vc) {
final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field);
assert metaData != null; // field is supposed to be in header
int nFields = metaData.getCount(vc.getNAlleles() - 1);
if ( nFields == -1 ) { // unbounded, need to look at values
return computeMaxSizeOfGenotypeFieldFromValues(field, vc);
@ -266,7 +267,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
throw new ReviewedStingException("BUG: header for " + field +
" has inconsistent number of values " + numInFormatField +
" compared to values in VariantContext " + ((List) val).size());
final Collection<Object> vals = numInFormatField == 1 ? Collections.singleton(val) : (Collection)val;
final List<Object> vals = numInFormatField == 1 ? Collections.singletonList(val) : (List<Object>)val;
encoder.encodeRawValues(vals, encoding.BCF2Type);
}
}
@ -275,12 +276,12 @@ class BCF2Writer extends IndexingVariantContextWriter {
private final class VCFToBCFEncoding {
VCFHeaderLineType vcfType;
BCF2Type BCF2Type;
List<Object> valuesToEncode;
List<? extends Object> valuesToEncode;
private VCFToBCFEncoding(final VCFHeaderLineType vcfType, final BCF2Type BCF2Type, final List<? extends Object> valuesToEncode) {
this.vcfType = vcfType;
this.BCF2Type = BCF2Type;
this.valuesToEncode = (List<Object>)valuesToEncode;
this.valuesToEncode = valuesToEncode;
}
}
@ -292,33 +293,39 @@ class BCF2Writer extends IndexingVariantContextWriter {
final boolean isList = value instanceof List;
final Object toType = isList ? ((List)value).get(0) : value;
switch ( metaData.getType() ) {
case Character:
assert toType instanceof String;
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, Collections.singletonList(value));
case Flag:
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1));
case String:
final List<String> s = isList ? (List<String>)value : Collections.singletonList((String) value);
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, s);
case Integer: // note integer calculation is a bit complex because of the need to determine sizes
List<Integer> l;
BCF2Type intType;
if ( isList ) {
l = (List<Integer>)value;
intType = encoder.determineIntegerType(l);
} else if ( value != null ) {
intType = encoder.determineIntegerType((Integer)value);
l = Collections.singletonList((Integer)value);
} else {
intType = BCF2Type.INT8;
l = Collections.singletonList((Integer) null);
}
return new VCFToBCFEncoding(metaData.getType(), intType, l);
case Float:
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.FLOAT, isList ? (List<Double>)value : Collections.singletonList(value));
default:
throw new ReviewedStingException("Unexpected type for field " + field);
try {
switch ( metaData.getType() ) {
case Character:
assert toType instanceof String;
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, Collections.singletonList(value));
case Flag:
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1));
case String:
final List<String> s = isList ? (List<String>)value : Collections.singletonList((String) value);
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, s);
case Integer: // note integer calculation is a bit complex because of the need to determine sizes
List<Integer> l;
BCF2Type intType;
if ( isList ) {
l = (List<Integer>)value;
intType = encoder.determineIntegerType(l);
} else if ( value != null ) {
intType = encoder.determineIntegerType((Integer)value);
l = Collections.singletonList((Integer)value);
} else {
intType = BCF2Type.INT8;
l = Collections.singletonList((Integer) null);
}
return new VCFToBCFEncoding(metaData.getType(), intType, l);
case Float:
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.FLOAT, isList ? (List<Double>)value : Collections.singletonList(value));
default:
throw new ReviewedStingException("Unexpected type for field " + field);
}
} catch ( ClassCastException e ) {
throw new ReviewedStingException("Error computing VCF -> BCF encoding. Received cast class exception"
+ " indicating that the VCF header for " + metaData + " is inconsistent with the" +
" value seen in the VariantContext object = " + value, e);
}
}
@ -404,17 +411,26 @@ class BCF2Writer extends IndexingVariantContextWriter {
* @throws IOException
*/
private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException {
assert infoBlock.length > 0;
assert genotypesBlock.length >= 0;
BCF2Encoder.encodePrimitive(infoBlock.length, BCF2Type.INT32, outputStream);
BCF2Encoder.encodePrimitive(genotypesBlock.length, BCF2Type.INT32, outputStream);
outputStream.write(infoBlock);
outputStream.write(genotypesBlock);
}
public final BCF2Type encodeStringByRef(final String string) throws IOException {
return encodeStringsByRef(Collections.singleton(string));
// TODO -- obvious optimization case
private final BCF2Type encodeStringByRef(final String string) throws IOException {
assert string != null;
return encodeStringsByRef(Collections.singletonList(string));
}
public final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
// TODO -- in size == 1 case branch to singleoton fast-path
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
assert ! strings.isEmpty();
final List<Integer> offsets = new ArrayList<Integer>(strings.size());
BCF2Type maxType = BCF2Type.INT8; // start with the smallest size
@ -424,12 +440,15 @@ class BCF2Writer extends IndexingVariantContextWriter {
if ( got == null ) throw new ReviewedStingException("Format error: could not find string " + string + " in header as required by BCF");
final int offset = got;
offsets.add(offset);
final BCF2Type type1 = encoder.determineIntegerType(offset);
switch ( type1 ) {
case INT8: break;
case INT16: if ( maxType == BCF2Type.INT8 ) maxType = BCF2Type.INT16; break;
case INT32: maxType = BCF2Type.INT32; break;
default: throw new ReviewedStingException("Unexpected type " + type1);
if ( maxType != BCF2Type.INT32) { // don't bother looking if we already are at 32 bit ints
final BCF2Type type1 = encoder.determineIntegerType(offset);
switch ( type1 ) {
case INT8: break;
case INT16: if ( maxType == BCF2Type.INT8 ) maxType = BCF2Type.INT16; break;
case INT32: maxType = BCF2Type.INT32; break;
default: throw new ReviewedStingException("Unexpected type " + type1);
}
}
}
@ -438,7 +457,10 @@ class BCF2Writer extends IndexingVariantContextWriter {
return maxType;
}
public final void startGenotypeField(final String key, final int size, final BCF2Type valueType) throws IOException {
private final void startGenotypeField(final String key, final int size, final BCF2Type valueType) throws IOException {
assert key != null && ! key.equals("");
assert size >= 0;
encodeStringByRef(key);
encoder.encodeType(size, valueType);
}