No more makePrecisionFormatStringFromDenominatorValue

-- As values in VCs are becoming their native Java types, the VCFWriter needs to own proper float formatting.
-- Created a smart float formatter in VCFWriter, with unit tests
-- Removed makePrecisionFormatStringFromDenominatorValue and its uses
-- Fix broken contracts
-- Refactored some code from the encoder to utils in BCF2
-- HaplotypeCaller's GenotypingEngine was using old version of subset to context.  Replaced with a faster call that I think is correct. Ryan, please confirm.
This commit is contained in:
Mark DePristo 2012-06-10 08:55:23 -04:00
parent a822087f11
commit 51a3b6e25e
11 changed files with 126 additions and 67 deletions

View File

@ -434,7 +434,7 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
final public int nGenotypeFields;
final public byte[] bytes;
@Requires({"nGenotypeField > 0", "bytes != null"})
@Requires({"nGenotypeFields > 0", "bytes != null"})
public LazyData(final int nGenotypeFields, final byte[] bytes) {
this.nGenotypeFields = nGenotypeFields;
this.bytes = bytes;
@ -446,7 +446,7 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
return getDictionaryString((Integer) decoder.decodeTypedValue());
}
@Requires("offset >= dictionary.size()")
@Requires("offset < dictionary.size()")
@Ensures("result != null")
protected final String getDictionaryString(final int offset) {
return dictionary.get(offset);

View File

@ -163,7 +163,7 @@ public final class BCF2Encoder {
encodeStream.write(typeByte);
if ( BCF2Utils.willOverflow(size) ) {
// write in the overflow size
encodeTyped(size, determineIntegerType(size));
encodeTyped(size, BCF2Utils.determineIntegerType(size));
}
}
@ -181,42 +181,13 @@ public final class BCF2Encoder {
//
// --------------------------------------------------------------------------------
/**
 * Finds the widest integer type required by any element of the array.
 *
 * Starts from INT8 and widens to INT16 when an element needs it; returns INT32
 * immediately as soon as one element needs it. An empty array yields INT8.
 *
 * @param values the ints to inspect
 * @return INT8, INT16 or INT32
 * @throws ReviewedStingException if the per-value lookup reports any other type
 */
public final BCF2Type determineIntegerType(final int[] values) {
    // same logic as the List<Integer> overload, duplicated because arrays and lists cannot share one loop
    BCF2Type widest = BCF2Type.INT8;
    for ( int i = 0; i < values.length; i++ ) {
        final BCF2Type needed = determineIntegerType(values[i]);
        if ( needed == BCF2Type.INT32 )
            return BCF2Type.INT32; // nothing can be wider, stop scanning
        else if ( needed == BCF2Type.INT16 )
            widest = BCF2Type.INT16;
        else if ( needed != BCF2Type.INT8 )
            throw new ReviewedStingException("Unexpected integer type " + needed );
    }
    return widest;
}
/**
 * Finds the widest integer type required by any element of the list.
 *
 * Starts from INT8 and widens to INT16 when an element needs it; returns INT32
 * immediately as soon as one element needs it. An empty list yields INT8.
 *
 * @param values the Integers to inspect; each element is unboxed to int
 * @return INT8, INT16 or INT32
 * @throws ReviewedStingException if the per-value lookup reports any other type
 */
public final BCF2Type determineIntegerType(final List<Integer> values) {
    BCF2Type widest = BCF2Type.INT8;
    for ( final int current : values ) {
        final BCF2Type needed = determineIntegerType(current);
        if ( needed == BCF2Type.INT32 )
            return BCF2Type.INT32; // nothing can be wider, stop scanning
        else if ( needed == BCF2Type.INT16 )
            widest = BCF2Type.INT16;
        else if ( needed != BCF2Type.INT8 )
            throw new ReviewedStingException("Unexpected integer type " + needed );
    }
    return widest;
}
public final BCF2Type determineIntegerType(final int value) {
for ( final BCF2Type potentialType : BCF2Utils.INTEGER_TYPES_BY_SIZE ) {
if ( potentialType.withinRange(value) )
return potentialType;
}
throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
/**
 * Writes a string into exactly sizeToWrite CHAR slots.
 *
 * The bytes come from s.getBytes() (platform default charset). If there are
 * fewer bytes than sizeToWrite, the remainder is padded with the CHAR missing
 * value; if there are more, the extra bytes are not written.
 *
 * @param s           the string to encode
 * @param sizeToWrite the total number of CHAR slots to emit
 * @throws IOException declared for the underlying raw-write calls
 */
public void encodeString(final String s, final int sizeToWrite) throws IOException {
    final byte[] raw = s.getBytes();
    final int nReal = Math.min(raw.length, sizeToWrite);

    // the actual characters first ...
    for ( int pos = 0; pos < nReal; pos++ )
        encodeRawChar(raw[pos]);

    // ... then missing-value padding up to the requested width
    for ( int pos = nReal; pos < sizeToWrite; pos++ )
        encodeRawMissingValue(BCF2Type.CHAR);
}
/**
@ -245,7 +216,7 @@ public final class BCF2Encoder {
final Object toType = arg instanceof List ? ((List)arg).get(0) : arg;
if ( toType instanceof Integer )
return determineIntegerType((Integer)toType);
return BCF2Utils.determineIntegerType((Integer) toType);
else if ( toType instanceof String )
return BCF2Type.CHAR;
else if ( toType instanceof Double )

View File

@ -219,4 +219,42 @@ public final class BCF2Utils {
else
return new File( path + ".bcf" );
}
/**
 * Picks the integer BCF2 type to use for a single value.
 *
 * Walks INTEGER_TYPES_BY_SIZE in iteration order and returns the first type
 * whose withinRange(value) is true (presumably smallest type first, going by
 * the constant's name -- confirm against its declaration).
 *
 * @param value the int to classify
 * @return the first type that can hold value
 * @throws ReviewedStingException if no candidate type accepts the value
 */
public final static BCF2Type determineIntegerType(final int value) {
    for ( final BCF2Type candidate : INTEGER_TYPES_BY_SIZE) {
        if ( ! candidate.withinRange(value) )
            continue;
        return candidate;
    }

    throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
}
/**
 * Finds the widest integer type required by any element of the array.
 *
 * Starts from INT8 and widens to INT16 when an element needs it; returns INT32
 * immediately as soon as one element needs it. An empty array yields INT8.
 *
 * @param values the ints to inspect
 * @return INT8, INT16 or INT32
 * @throws ReviewedStingException if the per-value lookup reports any other type
 */
public final static BCF2Type determineIntegerType(final int[] values) {
    // same logic as the List<Integer> overload, duplicated because arrays and lists cannot share one loop
    BCF2Type widest = BCF2Type.INT8;
    for ( int i = 0; i < values.length; i++ ) {
        final BCF2Type needed = determineIntegerType(values[i]);
        if ( needed == BCF2Type.INT32 )
            return BCF2Type.INT32; // nothing can be wider, stop scanning
        else if ( needed == BCF2Type.INT16 )
            widest = BCF2Type.INT16;
        else if ( needed != BCF2Type.INT8 )
            throw new ReviewedStingException("Unexpected integer type " + needed );
    }
    return widest;
}
/**
 * Finds the widest integer type required by any element of the list.
 *
 * Starts from INT8 and widens to INT16 when an element needs it; returns INT32
 * immediately as soon as one element needs it. An empty list yields INT8.
 *
 * @param values the Integers to inspect; each element is unboxed to int
 * @return INT8, INT16 or INT32
 * @throws ReviewedStingException if the per-value lookup reports any other type
 */
public final static BCF2Type determineIntegerType(final List<Integer> values) {
    BCF2Type widest = BCF2Type.INT8;
    for ( final int current : values ) {
        final BCF2Type needed = determineIntegerType(current);
        if ( needed == BCF2Type.INT32 )
            return BCF2Type.INT32; // nothing can be wider, stop scanning
        else if ( needed == BCF2Type.INT16 )
            widest = BCF2Type.INT16;
        else if ( needed != BCF2Type.INT8 )
            throw new ReviewedStingException("Unexpected integer type " + needed );
    }
    return widest;
}
}

View File

@ -114,7 +114,5 @@ public final class VCFConstants {
public static final String EMPTY_GENOTYPE = "./.";
public static final int MAX_GENOTYPE_QUAL = 99;
public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
public static final String DOUBLE_PRECISION_INT_SUFFIX = ".00";
public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare
}

View File

@ -390,7 +390,6 @@ public final class GenotypeBuilder {
*
* @return
*/
@Requires("filters != null")
public GenotypeBuilder unfiltered() {
if ( extendedAttributes != null )
extendedAttributes.remove(VCFConstants.GENOTYPE_FILTER_KEY);

View File

@ -111,8 +111,7 @@ public class VariantContextUtils {
if ( AN == 0 ) {
alleleFreqs.add(0.0);
} else {
// todo -- this is a performance problem
final Double freq = Double.valueOf(String.format(makePrecisionFormatStringFromDenominatorValue(totalFoundersChromosomes), ((double)foundersAltChromosomes / totalFoundersChromosomes)));
final Double freq = (double)foundersAltChromosomes / totalFoundersChromosomes;
alleleFreqs.add(freq);
}
}
@ -155,17 +154,6 @@ public class VariantContextUtils {
builder.attributes(calculateChromosomeCounts(vc, new HashMap<String, Object>(vc.getAttributes()), removeStaleValues, founderIds));
}
/**
 * Builds a "%.Nf" format string whose precision grows with the size of maxValue.
 *
 * N starts at 1 and is incremented once for every division by 10 needed to
 * bring maxValue down to 1 or below.
 *
 * @param maxValue the value that determines the precision
 * @return a format string of the form "%.Nf"
 */
public static String makePrecisionFormatStringFromDenominatorValue(double maxValue) {
    int digits = 1;
    for ( double remaining = maxValue; remaining > 1; remaining /= 10.0 )
        digits++;
    return "%." + digits + "f";
}
public static Genotype removePLs(Genotype g) {
if ( g.hasLikelihoods() )
return new GenotypeBuilder(g).noPL().make();

View File

@ -209,7 +209,7 @@ class VCFWriter extends IndexingVariantContextWriter {
if ( !vc.hasLog10PError() )
mWriter.write(VCFConstants.MISSING_VALUE_v4);
else
mWriter.write(getQualValue(vc.getPhredScaledQual()));
mWriter.write(formatQualValue(vc.getPhredScaledQual()));
mWriter.write(VCFConstants.FIELD_SEPARATOR);
// FILTER
@ -277,10 +277,13 @@ class VCFWriter extends IndexingVariantContextWriter {
return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
}
private String getQualValue(double qual) {
String s = String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, qual);
if ( s.endsWith(VCFConstants.DOUBLE_PRECISION_INT_SUFFIX) )
s = s.substring(0, s.length() - VCFConstants.DOUBLE_PRECISION_INT_SUFFIX.length());
private static final String QUAL_FORMAT_STRING = "%.2f";
private static final String QUAL_FORMAT_EXTENSION_TO_TRIM = ".00";
/**
 * Formats the QUAL value for output.
 *
 * The value is rendered with QUAL_FORMAT_STRING; if the result ends with
 * QUAL_FORMAT_EXTENSION_TO_TRIM that suffix is removed, so whole numbers are
 * written without a fractional part.
 *
 * @param qual the quality value to format
 * @return the formatted value, with the trimmable suffix removed when present
 */
private String formatQualValue(double qual) {
    final String formatted = String.format(QUAL_FORMAT_STRING, qual);
    if ( ! formatted.endsWith(QUAL_FORMAT_EXTENSION_TO_TRIM) )
        return formatted;

    final int keep = formatted.length() - QUAL_FORMAT_EXTENSION_TO_TRIM.length();
    return formatted.substring(0, keep);
}
@ -431,12 +434,39 @@ class VCFWriter extends IndexingVariantContextWriter {
mWriter.write(encoding);
}
/**
 * Takes a double value and pretty prints it to a String for display in a VCF field.
 *
 * The format is chosen from the magnitude of the value, so a negative number
 * is formatted exactly like its positive counterpart with a leading minus sign:
 *
 *   |d| >= 0.1            =&gt; %.2f style formatting
 *   0.01 &lt;= |d| &lt; 0.1     =&gt; %.3f style formatting
 *   1e-20 &lt;= |d| &lt; 0.01   =&gt; %.3e style formatting
 *   |d| &lt; 1e-20           =&gt; the literal "0.00"
 *
 * NaN and the infinities fail every magnitude comparison and therefore use
 * %.2f, which renders them as "NaN", "Infinity" and "-Infinity".
 *
 * Formatting always uses Locale.US so the decimal separator is '.' no matter
 * what the JVM default locale is.
 *
 * @param d the value to format
 * @return the formatted representation of d
 */
public static final String formatVCFDouble(final double d) {
    // choose precision by magnitude; comparing the signed value would push every negative number into %.3e
    final double magnitude = Math.abs(d);

    String format = "%.2f";
    if ( magnitude < 0.1 ) {
        if ( magnitude < 0.01 ) {
            if ( magnitude < 1e-20 ) {
                // too small to be meaningful: return a zero format
                return "0.00";
            }
            format = "%.3e";
        } else {
            format = "%.3f";
        }
    }

    // fixed locale: the default locale may use ',' as the decimal separator
    return String.format(java.util.Locale.US, format, d);
}
public static String formatVCFField(Object val) {
String result;
if ( val == null )
result = VCFConstants.MISSING_VALUE_v4;
else if ( val instanceof Double )
result = String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, (Double)val);
result = formatVCFDouble((Double) val);
else if ( val instanceof Boolean )
result = (Boolean)val ? "" : null; // empty string for true, null for false
else if ( val instanceof List ) {

View File

@ -6,7 +6,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.testng.Assert;
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderUnitTest;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderUnitTest;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.testng.annotations.Test;

View File

@ -317,8 +317,8 @@ public class BCF2EncoderDecoderUnitTest extends BaseTest {
@Test(dataProvider = "BestIntTypeTests")
public void determineBestEncoding(final List<Integer> ints, final BCF2Type expectedType) throws IOException {
BCF2Encoder encoder = new BCF2Encoder();
Assert.assertEquals(encoder.determineIntegerType(ints), expectedType);
Assert.assertEquals(encoder.determineIntegerType(ArrayUtils.toPrimitive(ints.toArray(new Integer[0]))), expectedType);
Assert.assertEquals(BCF2Utils.determineIntegerType(ints), expectedType);
Assert.assertEquals(BCF2Utils.determineIntegerType(ArrayUtils.toPrimitive(ints.toArray(new Integer[0]))), expectedType);
}
// -----------------------------------------------------------------

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.utils.genotype.vcf;
package org.broadinstitute.sting.utils.codecs.vcf;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.PositionalBufferedStream;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.utils.genotype.vcf;
package org.broadinstitute.sting.utils.variantcontext.writer;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broad.tribble.AbstractFeatureReader;
@ -15,10 +15,12 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.variantcontext.*;
import org.broadinstitute.sting.utils.variantcontext.writer.VCFWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
@ -147,4 +149,37 @@ public class VCFWriterUnitTest extends BaseTest {
}
Assert.assertEquals(index, additionalColumns.size());
}
/**
 * Supplies the cases for the VCFWriter double-formatting test.
 *
 * @return rows of { input double, expected formatted string }
 */
@DataProvider(name = "VCFWriterDoubleFormatTestData")
public Object[][] makeVCFWriterDoubleFormatTestData() {
    return new Object[][]{
            // values of 0.1 and above, plus zero: two decimal places
            {1.0, "1.00"},
            {10.1, "10.10"},
            {10.01, "10.01"},
            {10.012, "10.01"},
            {10.015, "10.02"},
            {0.0, "0.00"},
            {0.5, "0.50"},
            {0.55, "0.55"},
            {0.555, "0.56"},
            {0.1, "0.10"},
            // between 0.01 and 0.1: three decimal places
            {0.050, "0.050"},
            {0.010, "0.010"},
            {0.012, "0.012"},
            // below 0.01: scientific notation
            {0.0012, "1.200e-03"},
            {1.2e-4, "1.200e-04"},
            {1.21e-4, "1.210e-04"},
            {1.212e-5, "1.212e-05"},
            {1.2123e-6, "1.212e-06"},
            // non-finite values
            {Double.POSITIVE_INFINITY, "Infinity"},
            {Double.NEGATIVE_INFINITY, "-Infinity"},
            {Double.NaN, "NaN"}
    };
}
/**
 * Checks that VCFWriter.formatVCFDouble renders d as the expected string.
 *
 * @param d        the value to format
 * @param expected the exact string the formatter must produce
 */
@Test(dataProvider = "VCFWriterDoubleFormatTestData")
public void testVCFWriterDoubleFormatTestData(final double d, final String expected) {
    final String actual = VCFWriter.formatVCFDouble(d);
    Assert.assertEquals(actual, expected, "Failed to pretty print double in VCFWriter");
}
}