Genotype encoding uses the BCF2FieldEncoder system

This commit is contained in:
Mark DePristo 2012-06-12 15:44:12 -04:00
parent 9ac4203254
commit dd6aee347a
8 changed files with 495 additions and 369 deletions

View File

@ -220,6 +220,7 @@ public final class BCF2Utils {
return new File( path + ".bcf" );
}
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type determineIntegerType(final int value) {
for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) {
if ( potentialType.withinRange(value) )
@ -229,6 +230,7 @@ public final class BCF2Utils {
throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
}
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type determineIntegerType(final int[] values) {
// literally a copy of the code below, but there's no general way to unify lists and arrays in java
BCF2Type maxType = BCF2Type.INT8;
@ -244,6 +246,27 @@ public final class BCF2Utils {
return maxType;
}
/**
* Returns the maximum BCF2 integer size of t1 and t2
*
* For example, if t1 == INT8 and t2 == INT16 returns INT16
*
* @param t1
* @param t2
* @return
*/
/**
 * Returns whichever of t1 and t2 is the wider BCF2 integer type.
 *
 * For example, given INT8 and INT16 this returns INT16.
 *
 * @param t1 an integer BCF2 type
 * @param t2 an integer BCF2 type
 * @return the wider of the two types
 */
@Requires({"BCF2Type.INTEGERS.contains(t1)","BCF2Type.INTEGERS.contains(t2)"})
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) {
    switch ( t1 ) {
        case INT32: return t1;                                          // nothing is wider than INT32
        case INT16: return t2 == BCF2Type.INT32 ? BCF2Type.INT32 : t1;  // only INT32 beats INT16
        case INT8:  return t2;                                          // every integer type is at least as wide as INT8
        default:    throw new ReviewedStingException("BUG: unexpected BCF2Type " + t1);
    }
}
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type determineIntegerType(final List<Integer> values) {
BCF2Type maxType = BCF2Type.INT8;
for ( final int value : values ) {
@ -257,4 +280,21 @@ public final class BCF2Utils {
}
return maxType;
}
/**
 * Helper function that takes an object and returns a list representation
 * of it:
 *
 *   o == null    => [] (immutable empty list)
 *   o is a List  => o itself (not a copy)
 *   else         => [o] (immutable singleton list)
 *
 * @param o an arbitrary value, possibly null
 * @return a List view of o as described above; never null
 */
@SuppressWarnings("unchecked") // cast is unchecked by design: callers hand us Lists of arbitrary element type
public final static List<Object> toList(final Object o) {
    if ( o == null ) return Collections.emptyList();
    else if ( o instanceof List ) return (List<Object>)o;
    else return Collections.singletonList(o);
}
}

View File

@ -56,8 +56,9 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
public String getDescription() { return description; }
public VCFHeaderLineType getType() { return type; }
public VCFHeaderLineCount getCountType() { return countType; }
// A fixed count means the header declared a literal integer (e.g. Number=2),
// as opposed to A/G/./unbounded count types.
public boolean isFixedCount() { return countType == VCFHeaderLineCount.INTEGER; }

/**
 * Returns the fixed count declared by this header line.
 *
 * @return the declared count
 * @throws ReviewedStingException if the count is not a fixed integer (check isFixedCount() first)
 */
public int getCount() {
    if ( ! isFixedCount() )
        throw new ReviewedStingException("Asking for header line count when type is not an integer");
    return count;
}

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.variantcontext.writer;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type;
@ -44,6 +46,11 @@ import java.util.Map;
* @author Your Name
* @since Date created
*/
@Invariant({
"headerLine != null",
"BCF2Type.INTEGERS.contains(dictionaryOffsetType)",
"dictionaryOffset >= 0"
})
public abstract class BCF2FieldEncoder {
final VCFCompoundHeaderLine headerLine;
final BCF2Type fixedType;
@ -72,16 +79,41 @@ public abstract class BCF2FieldEncoder {
public boolean hasUnboundedCount() { return getCountType() == VCFHeaderLineCount.UNBOUNDED; }
public boolean hasContextDeterminedCount() { return ! hasFixedCount() && ! hasUnboundedCount(); }
//
// TODO -- this class should own two clean methods
//
// Tell us whether the type and size are static, determined from the VC itself,
// or from the actual encoded values. If the last case, provide a function that tell us
// the encoding type and size of the underlying data, given a single value.
//
// TODO -- cleanup logic of counts
// todo -- differentiate between the VCF header declared size and the encoded size
// TODO -- for example, getUnboundedCount should be getCountFromSizeOfValue()
//
// GenotypeEncoders need to inspect the size properties of the underlying encoder
// and determine how (and whether) they need to iterate once through the data to
// determine max size (for padding)
//
// The fixed number of values per record declared by the VCF header line.
// Only meaningful when hasFixedCount() is true.
@Requires("hasFixedCount()")
public int getFixedCount() {
    return headerLine.getCount();
}
// Count for an unbounded ('.') field: the size of the value list, or 1 for a scalar.
// NOTE(review): a null value falls into the scalar branch and reports 1 here; the
// int[] subclass overrides this to report 0 for null — confirm this difference is intended.
public int getUnboundedCount(final Object value) {
    return value instanceof List ? ((List) value).size() : 1;
}
// Count for an allele-dependent field: delegate to the header line, which derives
// the expected number of values from the number of alternate alleles at this site.
public int getContextDeterminedCount(final VariantContext vc) {
    return headerLine.getCount(vc.getNAlleles() - 1);
}
public int getBCFFieldCount(final VariantContext vc, final Object value) {
if ( hasFixedCount() )
return getFixedCount();
else if ( hasUnboundedCount() )
return value instanceof List ? ((List) value).size() : 1;
return getUnboundedCount(value);
else
return getContextDeterminedCount(vc);
}
@ -107,7 +139,11 @@ public abstract class BCF2FieldEncoder {
return "BCF2FieldEncoder for " + getField() + " with count " + getCountType() + " encoded with " + getClass().getSimpleName();
}
/**
 * Convenience overload: encode value with no minimum number of values (no padding).
 */
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException {
    encodeValue(encoder, value, type, 0);
}

/**
 * Encode value with the given type, padding with missing values up to minValues
 * entries so that every sample in a genotype block carries the same count.
 */
public abstract void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException;
/**
@ -133,10 +169,10 @@ public abstract class BCF2FieldEncoder {
}
@Override
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException {
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
if ( value != null ) {
final String s = encodeString(value);
encoder.encodeString(s, s.length());
encoder.encodeString(s, Math.max(s.length(), minValues));
}
}
@ -153,7 +189,7 @@ public abstract class BCF2FieldEncoder {
public static class Flag extends BCF2FieldEncoder {
public Flag(final VCFCompoundHeaderLine headerLine, final BCF2Encoder encoder, final Map<String, Integer> dict ) {
    super(headerLine, encoder, dict, BCF2Type.INT8);
    // a Flag must be declared with a fixed count of exactly 0 values
    if ( ! headerLine.isFixedCount() || headerLine.getCount() != 0 )
        throw new ReviewedStingException("Flag encoder only supports atomic flags!");
}
@ -163,7 +199,8 @@ public abstract class BCF2FieldEncoder {
}
@Override
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException {
@Requires("minValues <= 1")
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
encoder.encodePrimitive(1, getFixedType());
}
}
@ -174,10 +211,14 @@ public abstract class BCF2FieldEncoder {
}
@Override
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException {
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
final List<Double> doubles = toList(Double.class, value);
for ( final double d : doubles )
int count = 0;
for ( final double d : doubles ) {
encoder.encodeRawFloat(d);
count++;
}
for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type);
}
}
@ -186,15 +227,26 @@ public abstract class BCF2FieldEncoder {
super(headerLine, encoder, dict, null);
}
@Override
// int[] payload: null means the sample carries no values, so 0 entries get written
public int getUnboundedCount(final Object value) {
    return value == null ? 0 : ((int[])value).length;
}
@Override
// Choose the smallest integer type that can hold all values; INT8 when there are none.
public BCF2Type getDynamicType(final Object value) {
    return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType((int[])value);
}
@Override
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException {
for ( final int i : (int[])value )
encoder.encodeRawInt(i, type);
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
int count = 0;
if ( value != null ) {
for ( final int i : (int[])value ) {
encoder.encodeRawInt(i, type);
count++;
}
}
for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type);
}
}
@ -209,9 +261,13 @@ public abstract class BCF2FieldEncoder {
}
@Override
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException {
for ( final int i : toList(Integer.class, value) )
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
int count = 0;
for ( final int i : toList(Integer.class, value) ) {
encoder.encodeRawInt(i, type);
count++;
}
for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type);
}
}
@ -226,8 +282,9 @@ public abstract class BCF2FieldEncoder {
}
@Override
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException {
encoder.encodeRawInt((Integer)value, type);
@Requires("minValues <= 1") // 0 is ok as this means no values need to be encoded
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
encoder.encodeRawInt(value == null ? type.getMissingBytes() : (Integer)value, type);
}
}
}

View File

@ -26,9 +26,18 @@ package org.broadinstitute.sting.utils.variantcontext.writer;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Field writers that stream BCF2-encoded site (INFO) and genotype (FORMAT) data via BCF2FieldEncoder.
@ -58,43 +67,53 @@ import java.io.IOException;
* @since Date created
*/
public abstract class BCF2FieldWriter {
private final VCFHeader header;
private final BCF2FieldEncoder fieldEncoder;
// All writers carry the VCF header (for sample / metadata lookups) and the
// field encoder that does the low-level BCF2 byte emission.
protected BCF2FieldWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
    this.header = header;
    this.fieldEncoder = fieldEncoder;
}
protected VCFHeader getHeader() { return header; }
protected BCF2FieldEncoder getFieldEncoder() {
return fieldEncoder;
}
protected String getField() { return getFieldEncoder().getField(); }
// Begin this field's record: write the field's dictionary offset (its key) using
// the offset type chosen by the encoder.
public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
    encoder.encodeTyped(fieldEncoder.getDictionaryOffset(), fieldEncoder.getDictionaryOffsetType());
}
// Hook invoked after all values for this field are written; no-op by default.
// TODO -- overload done so that we null out values and test for correctness
public void done(final BCF2Encoder encoder, final VariantContext vc) throws IOException { }
@Override
// Human-readable description used in setup logging and error messages.
public String toString() {
    return "BCF2FieldWriter " + getClass().getSimpleName() + " with encoder " + getFieldEncoder();
}
// --------------------------------------------------------------------------------
//
// Sites writers
//
// --------------------------------------------------------------------------------
public static abstract class SiteWriter extends BCF2FieldWriter {
protected SiteWriter(final BCF2FieldEncoder fieldEncoder) {
super(fieldEncoder);
protected SiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
super(header, fieldEncoder);
}
public abstract void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException;
}
public static class GenericSiteWriter extends SiteWriter {
public GenericSiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
    super(header, fieldEncoder);
}
@Override
public void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
final Object rawValue = vc.getAttribute(getFieldEncoder().getField(), null);
final Object rawValue = vc.getAttribute(getField(), null);
final BCF2Type type = getFieldEncoder().getType(rawValue);
if ( rawValue == null ) {
// the value is missing, just write in null
@ -106,5 +125,189 @@ public abstract class BCF2FieldWriter {
}
}
}
// --------------------------------------------------------------------------------
//
// Genotypes writers
//
// --------------------------------------------------------------------------------
/**
 * Base class for FORMAT (per-genotype) field writers.
 *
 * Protocol per site: start() writes the field key and the (count, type)
 * descriptor, then addGenotype() is called once per sample, then done().
 */
public static abstract class GenotypesWriter extends BCF2FieldWriter {
    // number of values written per genotype at the current site; -1 until determined
    int nValuesPerGenotype = -1;
    // BCF2 primitive type used to encode each value; set by subclasses or in start()
    BCF2Type encodingType = null;

    protected GenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
        super(header, fieldEncoder);
        // a fixed-count field never changes size, so compute it once here
        if ( fieldEncoder.hasFixedCount() ) {
            nValuesPerGenotype = getFieldEncoder().getFixedCount();
        }
    }

    @Override
    public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
        // writes the key information
        super.start(encoder, vc);

        // only update if we need to
        if ( ! getFieldEncoder().hasFixedCount() ) {
            if ( getFieldEncoder().hasContextDeterminedCount() )
                // we are cheap -- just depends on genotype of allele counts
                nValuesPerGenotype = getFieldEncoder().getContextDeterminedCount(vc);
            else
                // we have to go fishing through the values themselves (expensive)
                nValuesPerGenotype = computeMaxSizeOfGenotypeFieldFromValues(vc);
        }

        encoder.encodeType(nValuesPerGenotype, encodingType);
    }

    // Encode one sample's value, padding up to nValuesPerGenotype entries.
    public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException {
        final Object fieldValue = g.getAttribute(getField(), null);
        getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype);
    }

    // Fetch this field's raw value from a sample; subclasses may override (e.g. int[] accessors).
    public Object getGenotypeValue(final Genotype g) {
        return g.getAttribute(getField());
    }

    // Scan every genotype to find the maximum number of values any single sample carries.
    private final int computeMaxSizeOfGenotypeFieldFromValues(final VariantContext vc) {
        int size = -1;

        for ( final Genotype g : vc.getGenotypes() ) {
            final Object o = getGenotypeValue(g);
            size = Math.max(size, getFieldEncoder().getBCFFieldCount(vc, o));
        }

        return size;
    }
}
// Writer for fields whose BCF2 primitive type is fixed by the encoder (no per-site
// type inference needed).
public static class FixedTypeGenotypesWriter extends GenotypesWriter {
    public FixedTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
        super(header, fieldEncoder);
        encodingType = getFieldEncoder().getFixedType();
    }
}
// Writer for generic Integer FORMAT fields: gathers every sample's values at the
// site and picks the smallest integer encoding that can represent all of them.
public static class IntegerTypeGenotypesWriter extends GenotypesWriter {
    public IntegerTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
        super(header, fieldEncoder);
    }

    @Override
    public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
        // the only value that is dynamic are integers
        final List<Integer> values = new ArrayList<Integer>(vc.getNSamples());
        for ( final Genotype g : vc.getGenotypes() ) {
            for ( final Object i : BCF2Utils.toList(g.getAttribute(getField(), null)) ) {
                values.add((Integer)i); // we know they are all integers
            }
        }

        encodingType = BCF2Utils.determineIntegerType(values);
        super.start(encoder, vc);
    }
}
// TODO TODO TODO TODO TODO
// TODO
// TODO THIS ROUTINE NEEDS TO BE OPTIMIZED. IT ACCOUNTS FOR A SIGNIFICANT AMOUNT OF THE
// TODO RUNTIME FOR WRITING OUT BCF FILES WITH MANY GENOTYPES
// TODO
// TODO TODO TODO TODO TODO
// Writer for FORMAT fields that have a fast int[] accessor (e.g. PL/AD/DP).
public static class IGFGenotypesWriter extends GenotypesWriter {
    // accessor that pulls this field's values out of a Genotype as an int[]
    final IntGenotypeFieldAccessors.Accessor ige;

    public IGFGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder, final IntGenotypeFieldAccessors.Accessor ige) {
        super(header, fieldEncoder);
        this.ige = ige;

        // this writer hands int[] directly to the encoder, so it must be an IntArray encoder
        if ( ! (fieldEncoder instanceof BCF2FieldEncoder.IntArray) )
            throw new ReviewedStingException("BUG: IntGenotypesWriter requires IntArray encoder for field " + getField());
    }

    @Override
    public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
        // widen the integer encoding type until it can hold every sample's values
        encodingType = BCF2Type.INT8;
        for ( final Genotype g : vc.getGenotypes() ) {
            final int[] pls = ige.getValues(g);
            final BCF2Type plsType = getFieldEncoder().getType(pls);
            encodingType = BCF2Utils.maxIntegerType(encodingType, plsType);
            if ( encodingType == BCF2Type.INT32 )
                break; // stop early
        }

        super.start(encoder, vc);
    }

    @Override
    public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException {
        getFieldEncoder().encodeValue(encoder, ige.getValues(g), encodingType, nValuesPerGenotype);
    }

    @Override
    public Object getGenotypeValue(final Genotype g) {
        return ige.getValues(g);
    }
}
// TODO TODO TODO TODO TODO
// TODO
// TODO we should really have a fast path for encoding diploid genotypes where
// TODO we don't pay the overhead of creating the allele maps
// TODO
// TODO TODO TODO TODO TODO
/**
 * Writer for the GT field: each allele of each sample's genotype is encoded
 * as ((alleleIndex + 1) << 1) | phasedBit, with samples of lower ploidy
 * padded with the type's missing value up to the site's max ploidy.
 */
public static class GTWriter extends GenotypesWriter {
    // maps each allele at this site (plus NO_CALL -> -1) to its index in the VC's allele list
    Map<Allele, Integer> alleleMap = null;

    public GTWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
        super(header, fieldEncoder);
    }

    @Override
    public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
        if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES )
            throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " +
                    "with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have "
                    + vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart());

        encodingType = BCF2Type.INT8;
        alleleMap = buildAlleleMap(vc);
        nValuesPerGenotype = vc.getMaxPloidy();

        super.start(encoder, vc);
    }

    @Override
    public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException {
        final List<Allele> alleles = g.getAlleles();
        final int samplePloidy = alleles.size();
        for ( int i = 0; i < nValuesPerGenotype; i++ ) {
            if ( i < samplePloidy ) {
                // we encode the actual allele
                final Allele a = alleles.get(i);
                // NOTE(review): auto-unboxing NPEs here if a sample carries an allele
                // not present in vc.getAlleles() -- confirm callers guarantee this
                final int offset = alleleMap.get(a);
                final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00);
                encoder.encodePrimitive(encoded, encodingType);
            } else {
                // we need to pad with missing as we have ploidy < max for this sample
                encoder.encodePrimitive(encodingType.getMissingBytes(), encodingType);
            }
        }
    }

    // Build the allele -> index map for this site, with NO_CALL mapped to -1 for convenience.
    private final static Map<Allele, Integer> buildAlleleMap(final VariantContext vc) {
        final Map<Allele, Integer> alleleMap = new HashMap<Allele, Integer>(vc.getAlleles().size()+1);
        alleleMap.put(Allele.NO_CALL, -1); // convenience for lookup

        final List<Allele> alleles = vc.getAlleles();
        for ( int i = 0; i < alleles.size(); i++ ) {
            alleleMap.put(alleles.get(i), i);
        }

        return alleleMap;
    }
}
}

View File

@ -26,10 +26,7 @@ package org.broadinstitute.sting.utils.variantcontext.writer;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.HashMap;
@ -65,6 +62,8 @@ import java.util.Map;
public class BCF2FieldWriterManager {
final protected static Logger logger = Logger.getLogger(BCF2FieldWriterManager.class);
final Map<String, BCF2FieldWriter.SiteWriter> siteWriters = new HashMap<String, BCF2FieldWriter.SiteWriter>();
final Map<String, BCF2FieldWriter.GenotypesWriter> genotypesWriters = new HashMap<String, BCF2FieldWriter.GenotypesWriter>();
final IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors();
public BCF2FieldWriterManager() { }
@ -72,41 +71,108 @@ public class BCF2FieldWriterManager {
for (final VCFHeaderLine line : header.getMetaData()) {
if ( line instanceof VCFInfoHeaderLine ) {
final String field = ((VCFInfoHeaderLine) line).getID();
final BCF2FieldWriter.SiteWriter writer = createInfoWriter((VCFInfoHeaderLine)line, encoder, dictionary);
logger.info("Installing for field " + field + " field writer " + writer);
final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, (VCFInfoHeaderLine)line, encoder, dictionary);
log(field, writer);
siteWriters.put(field, writer);
} else if ( line instanceof VCFFormatHeaderLine ) {
final String field = ((VCFFormatHeaderLine) line).getID();
final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, (VCFFormatHeaderLine)line, encoder, dictionary);
log(field, writer);
genotypesWriters.put(field, writer);
}
}
}
private BCF2FieldWriter.SiteWriter createInfoWriter(final VCFInfoHeaderLine line, final BCF2Encoder encoder, final Map<String, Integer> dict) {
BCF2FieldEncoder fieldEncoder = null;
switch ( line.getType() ) {
case Character:
case String:
fieldEncoder = new BCF2FieldEncoder.StringOrCharacter(line, encoder, dict);
break;
case Flag:
fieldEncoder = new BCF2FieldEncoder.Flag(line, encoder, dict);
break;
case Float:
fieldEncoder = new BCF2FieldEncoder.Float(line, encoder, dict);
break;
case Integer:
if ( line.getCountType() == VCFHeaderLineCount.INTEGER && line.getCount() == 1 )
fieldEncoder = new BCF2FieldEncoder.AtomicInt(line, encoder, dict);
else
fieldEncoder = new BCF2FieldEncoder.IntList(line, encoder, dict);
break;
default:
throw new ReviewedStingException("Unexpected type for field " + line.getID());
}
return new BCF2FieldWriter.GenericSiteWriter(fieldEncoder);
// Log the writer installed for a field during setup (one-time cost).
// The field parameter was previously logged inline; include it so the message
// identifies which key the writer serves.
private final void log(final String field, final BCF2FieldWriter writer) {
    logger.info("Using writer for field " + field + ": " + writer);
}
// -----------------------------------------------------------------
//
// Master routine to look at the header, a specific line, and
// build an appropriate SiteWriter for that header element
//
// -----------------------------------------------------------------
// Build a SiteWriter for one INFO header line: a GenericSiteWriter wrapping the
// generic field encoder (genotype-specific encoders disabled).
private BCF2FieldWriter.SiteWriter createInfoWriter(final VCFHeader header,
                                                    final VCFInfoHeaderLine line,
                                                    final BCF2Encoder encoder,
                                                    final Map<String, Integer> dict) {
    return new BCF2FieldWriter.GenericSiteWriter(header, createFieldEncoder(line, encoder, dict, false));
}
/**
 * Choose the BCF2FieldEncoder for a header line.
 *
 * When createGenotypesEncoders is true, FORMAT-specific fast paths apply first:
 * fields with an int[] accessor get the IntArray encoder, and GT gets IntList.
 * Otherwise the encoder is selected purely from the header line's declared type.
 */
private BCF2FieldEncoder createFieldEncoder(final VCFCompoundHeaderLine line,
                                            final BCF2Encoder encoder,
                                            final Map<String, Integer> dict,
                                            final boolean createGenotypesEncoders ) {

    if ( createGenotypesEncoders && intGenotypeFieldAccessors.getAccessor(line.getID()) != null ) {
        // warn (but proceed) if the header disagrees with the int[] accessor's expectation
        if ( line.getType() != VCFHeaderLineType.Integer )
            logger.warn("Warning: field " + line.getID() + " expected to encode an integer but saw " + line.getType() + " for record " + line);
        return new BCF2FieldEncoder.IntArray(line, encoder, dict);
    } else if ( createGenotypesEncoders && line.getID().equals(VCFConstants.GENOTYPE_KEY) ) {
        return new BCF2FieldEncoder.IntList(line, encoder, dict);
    } else {
        switch ( line.getType() ) {
            case Character:
            case String:
                return new BCF2FieldEncoder.StringOrCharacter(line, encoder, dict);
            case Flag:
                return new BCF2FieldEncoder.Flag(line, encoder, dict);
            case Float:
                return new BCF2FieldEncoder.Float(line, encoder, dict);
            case Integer:
                // a single fixed int uses the atomic encoder; anything else a list encoder
                if ( line.getCountType() == VCFHeaderLineCount.INTEGER && line.getCount() == 1 )
                    return new BCF2FieldEncoder.AtomicInt(line, encoder, dict);
                else
                    return new BCF2FieldEncoder.IntList(line, encoder, dict);
            default:
                throw new ReviewedStingException("Unexpected type for field " + line.getID());
        }
    }
}
// -----------------------------------------------------------------
//
// Master routine to look at the header, a specific line, and
// build an appropriate Genotypes for that header element
//
// -----------------------------------------------------------------
/**
 * Build the GenotypesWriter for one FORMAT header line: GT gets the dedicated
 * GTWriter, int-accessor fields (e.g. PL) get the fast IGF writer, other integer
 * fields get per-site type inference, and everything else a fixed-type writer.
 */
private BCF2FieldWriter.GenotypesWriter createGenotypesWriter(final VCFHeader header,
                                                              final VCFFormatHeaderLine line,
                                                              final BCF2Encoder encoder,
                                                              final Map<String, Integer> dict) {
    final String field = line.getID();
    final BCF2FieldEncoder fieldEncoder = createFieldEncoder(line, encoder, dict, true);
    // look up the accessor once instead of twice
    final IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.getAccessor(field);

    if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
        return new BCF2FieldWriter.GTWriter(header, fieldEncoder);
    } else if ( accessor != null ) {
        return new BCF2FieldWriter.IGFGenotypesWriter(header, fieldEncoder, accessor);
    } else if ( line.getType() == VCFHeaderLineType.Integer ) {
        return new BCF2FieldWriter.IntegerTypeGenotypesWriter(header, fieldEncoder);
    } else {
        return new BCF2FieldWriter.FixedTypeGenotypesWriter(header, fieldEncoder);
    }
}
// -----------------------------------------------------------------
//
// Accessors to get site / genotype writers
//
// -----------------------------------------------------------------
// Look up the site-level (INFO) writer for a field; throws if none was installed.
public BCF2FieldWriter.SiteWriter getSiteFieldWriter(final String key) {
    return getWriter(key, siteWriters);
}
// Look up the genotypes (FORMAT) writer for a field; throws if none was installed.
public BCF2FieldWriter.GenotypesWriter getGenotypeFieldWriter(final String key) {
    return getWriter(key, genotypesWriters);
}
// Shared lookup helper: fetch the writer for key from map, failing loudly if absent
// (a missing writer means setup and usage disagree — a bug, not a user error).
public <T> T getWriter(final String key, final Map<String, T> map) {
    final T writer = map.get(key);
    if ( writer == null ) throw new ReviewedStingException("BUG: no writer found for " + key);
    return writer;
}

View File

@ -50,7 +50,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
private final boolean doNotWriteGenotypes;
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors();
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
@ -233,251 +232,32 @@ class BCF2Writer extends IndexingVariantContextWriter {
writer.start(encoder, vc);
writer.site(encoder, vc);
writer.done(encoder, vc);
// the old way of doing things
// final VCFToBCFEncoding encoding = prepFieldValueForEncoding(key, infoFieldEntry.getValue());
// encodeStringByRef(key);
// encoder.encodeTyped(encoding.valuesToEncode, encoding.BCF2Type);
}
}
// --------------------------------------------------------------------------------
//
// Type matching routines between VCF and BCF
//
// --------------------------------------------------------------------------------
private final class VCFToBCFEncoding {
VCFHeaderLineType vcfType;
BCF2Type BCF2Type;
List<? extends Object> valuesToEncode;
private VCFToBCFEncoding(final VCFHeaderLineType vcfType, final BCF2Type BCF2Type, final List<? extends Object> valuesToEncode) {
this.vcfType = vcfType;
this.BCF2Type = BCF2Type;
this.valuesToEncode = valuesToEncode;
}
}
// TODO TODO TODO TODO TODO
// TODO
// TODO -- we really need explicit converters as first class objects
// TODO -- need to generalize so we can enable vectors of compressed genotype ints
// TODO -- no sense in allocating these over and over
// TODO
// TODO TODO TODO TODO TODO
private final VCFToBCFEncoding prepFieldValueForEncoding(final String field, final Object value) {
final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, field);
final boolean isList = value instanceof List;
final Object toType = isList ? ((List)value).get(0) : value;
try {
switch ( metaData.getType() ) {
case Character:
assert toType instanceof String;
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, Collections.singletonList(value));
case Flag:
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1));
case String:
final String s = isList ? BCF2Utils.collapseStringList((List<String>)value) : (String)value;
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, Collections.singletonList(s));
case Integer: // note integer calculation is a bit complex because of the need to determine sizes
List<Integer> l;
BCF2Type intType;
if ( isList ) {
l = (List<Integer>)value;
intType = BCF2Utils.determineIntegerType(l);
} else if ( value != null ) {
intType = BCF2Utils.determineIntegerType((Integer) value);
l = Collections.singletonList((Integer)value);
} else {
intType = BCF2Type.INT8;
l = Collections.singletonList((Integer) null);
}
return new VCFToBCFEncoding(metaData.getType(), intType, l);
case Float:
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.FLOAT, isList ? (List<Double>)value : Collections.singletonList(value));
default:
throw new ReviewedStingException("Unexpected type for field " + field);
}
} catch ( ClassCastException e ) {
throw new ReviewedStingException("Error computing VCF -> BCF encoding. Received cast class exception"
+ " indicating that the VCF header for " + metaData + " is inconsistent with the" +
" value seen in the VariantContext object = " + value, e);
}
}
// --------------------------------------------------------------------------------
//
// Genotype field encoding
//
// --------------------------------------------------------------------------------
private byte[] buildSamplesData(final VariantContext vc) throws IOException {
BCF2Codec.LazyData lazyData = getLazyData(vc);
final BCF2Codec.LazyData lazyData = getLazyData(vc);
if ( lazyData != null ) {
// we never decoded any data from this BCF file, so just pass it back
return lazyData.bytes;
} else {
// we have to do work to convert the VC into a BCF2 byte stream
List<String> genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header);
final List<String> genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header);
for ( final String field : genotypeFields ) {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
addGenotypes(vc);
} else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) {
addIntGenotypeField(field, intGenotypeFieldAccessors.getAccessor(field), vc);
} else {
addGenericGenotypeField(vc, field);
final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field);
writer.start(encoder, vc);
for ( final String name : header.getGenotypeSamples() ) {
// todo -- can we optimize this get (string -> genotype) which can be expensive
final Genotype g = vc.getGenotype(name);
writer.addGenotype(encoder, vc, g);
}
writer.done(encoder, vc);
}
return encoder.getRecordBytes();
}
}
// Number of values per genotype for field: fixed counts come from the header,
// unbounded counts (-1) require scanning the actual per-sample values.
private final int getNGenotypeFieldValues(final String field, final VariantContext vc, final IntGenotypeFieldAccessors.Accessor ige) {
    final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, field);
    assert metaData != null; // field is supposed to be in header

    int nFields = metaData.getCount(vc.getNAlleles() - 1);
    if ( nFields == -1 ) { // unbounded, need to look at values
        return computeMaxSizeOfGenotypeFieldFromValues(field, vc, ige);
    } else {
        return nFields;
    }
}
// Scan every genotype's value for field and return the largest entry count seen.
// With an int accessor (ige) use its fast size path; otherwise inspect the generic
// attribute: a List counts as its size, any other non-null value as 1.
// Returns -1 if no sample carries a value.
private final int computeMaxSizeOfGenotypeFieldFromValues(final String field, final VariantContext vc, final IntGenotypeFieldAccessors.Accessor ige) {
    int size = -1;
    final GenotypesContext gc = vc.getGenotypes();

    if ( ige == null ) {
        for ( final Genotype g : gc ) {
            final Object o = g.getAttribute(field);
            if ( o == null ) continue;
            if ( o instanceof List ) {
                // only do compute if first value is of type list
                size = Math.max(size, ((List)o).size());
            } else if ( size == -1 )
                size = 1;
        }
    } else {
        for ( final Genotype g : gc ) {
            size = Math.max(size, ige.getSize(g));
        }
    }

    return size;
}
// Encode a generic (non-GT, non-int-accessor) FORMAT field for every sample:
// determine the per-sample count, write the field header, then each sample's
// value(s), padding short samples with nulls (encoded as missing).
private final void addGenericGenotypeField(final VariantContext vc, final String field) throws IOException {
    final int numInFormatField = getNGenotypeFieldValues(field, vc, null);
    final VCFToBCFEncoding encoding = prepFieldValueForEncoding(field, null);

    startGenotypeField(field, numInFormatField, encoding.BCF2Type);
    for ( final String name : header.getGenotypeSamples() ) {
        final Genotype g = vc.getGenotype(name); // todo -- can we optimize this?
        try {
            final Object fieldValue = g.getAttribute(field);
            if ( numInFormatField == 1 ) {
                // we encode the actual allele, encodeRawValue handles the missing case where fieldValue == null
                encoder.encodeRawValue(fieldValue, encoding.BCF2Type);
            } else if ( encoding.vcfType == VCFHeaderLineType.String ) {
                // we have multiple strings, so we need to collapse them
                throw new ReviewedStingException("LIMITATION: Cannot encode arrays of string values in genotypes");
            } else {
                // multiple values, need to handle general case
                final List<Object> asList = toList(fieldValue);
                final int nSampleValues = asList.size();
                for ( int i = 0; i < numInFormatField; i++ ) {
                    encoder.encodeRawValue(i < nSampleValues ? asList.get(i) : null, encoding.BCF2Type);
                }
            }
        } catch ( ClassCastException e ) {
            throw new ReviewedStingException("Value stored in VariantContext incompatible with VCF header type for field " + field, e);
        }
    }
}
// TODO TODO TODO TODO TODO
// TODO
// TODO THIS ROUTINE NEEDS TO BE OPTIMIZED. IT ACCOUNTS FOR A SIGNIFICANT AMOUNT OF THE
// TODO RUNTIME FOR WRITING OUT BCF FILES WITH MANY GENOTYPES
// TODO
// TODO TODO TODO TODO TODO
/**
 * Write one int-valued FORMAT field (one with an IntGenotypeFieldAccessors
 * fast path, e.g. PL) for every sample at this site.
 *
 * All values are gathered first so a single minimal integer type can be chosen
 * for the whole field. Samples without values -- or with fewer values than the
 * site-wide maximum -- are padded with the chosen type's missing encoding.
 * Previously a sample with fewer values than the maximum tripped an assert
 * (or, with assertions disabled, silently misaligned the value vector).
 *
 * @param field the FORMAT field name
 * @param ige   fast-path accessor that extracts the int values from a Genotype
 * @param vc    the variant context being encoded
 * @throws IOException if the underlying encoder fails
 * @throws ReviewedStingException if a sample carries more values than the site-wide count
 */
private final void addIntGenotypeField(final String field,
                                       final IntGenotypeFieldAccessors.Accessor ige,
                                       final VariantContext vc) throws IOException {
    final int numPLs = getNGenotypeFieldValues(field, vc, ige);
    final int[] allPLs = new int[numPLs * vc.getNSamples()];

    // collect all of the values into a single vector, using -1 as the missing
    // sentinel (the pre-existing convention for these fields; assumed never to
    // be a real value here)
    int i = 0;
    for ( final String name : header.getGenotypeSamples() ) {
        final Genotype g = vc.getGenotype(name); // todo -- can we optimize this?
        final int[] pls = ige.getValues(g);
        if ( pls == null ) {
            // sample has no values at all: every slot is missing
            for ( int j = 0; j < numPLs; j++ ) allPLs[i++] = -1;
        } else {
            if ( pls.length > numPLs )
                throw new ReviewedStingException("BUG: sample " + name + " has " + pls.length
                        + " values for genotype field " + field
                        + " but the site-wide maximum is " + numPLs);
            for ( final int pl : pls ) allPLs[i++] = pl;
            // pad samples carrying fewer values than the site-wide maximum
            for ( int j = pls.length; j < numPLs; j++ ) allPLs[i++] = -1;
        }
    }

    // choose the smallest integer type able to represent every collected value
    final BCF2Type type = BCF2Utils.determineIntegerType(allPLs);
    startGenotypeField(field, numPLs, type);
    for ( final int pl : allPLs )
        encoder.encodePrimitive(pl == -1 ? type.getMissingBytes() : pl, type);
}
// TODO TODO TODO TODO TODO
// TODO
// TODO we should really have a fast path for encoding diploid genotypes where
// TODO we don't pay the overhead of creating the allele maps
// TODO
// TODO TODO TODO TODO TODO
/**
 * Write the GT field for every sample at this site.
 *
 * Each allele is packed into one INT8 as ((alleleOffset + 1) << 1) | phaseBit;
 * samples whose ploidy is below the site maximum are padded with missing.
 *
 * @param vc the variant context being encoded
 * @throws IOException if the underlying encoder fails
 * @throws ReviewedStingException if the site has more alleles than BCF2 genotypes support
 */
private final void addGenotypes(final VariantContext vc) throws IOException {
    if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES )
        throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " +
                "with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have "
                + vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart());

    final Map<Allele, Integer> alleleOffsets = buildAlleleMap(vc);
    final int maxPloidy = vc.getMaxPloidy();
    startGenotypeField(VCFConstants.GENOTYPE_KEY, maxPloidy, BCF2Type.INT8);

    for ( final String sample : header.getGenotypeSamples() ) {
        final Genotype g = vc.getGenotype(sample); // todo -- can we optimize this?
        final List<Allele> sampleAlleles = g.getAlleles();
        final int ploidy = sampleAlleles.size();
        final int phaseBit = g.isPhased() ? 0x01 : 0x00;

        for ( int i = 0; i < maxPloidy; i++ ) {
            if ( i < ploidy ) {
                // encode this sample's i-th allele via its offset in the site's allele list
                final int offset = alleleOffsets.get(sampleAlleles.get(i));
                encoder.encodePrimitive(((offset + 1) << 1) | phaseBit, BCF2Type.INT8);
            } else {
                // pad with missing: this sample's ploidy is below the site maximum
                encoder.encodePrimitive(BCF2Type.INT8.getMissingBytes(), BCF2Type.INT8);
            }
        }
    }
}
/**
 * Build a map from each of the site's alleles to its offset in the site's
 * allele list, with NO_CALL mapped to -1 as a lookup convenience.
 *
 * @param vc the variant context supplying the alleles
 * @return a fresh, mutable allele -> offset map
 */
private final static Map<Allele, Integer> buildAlleleMap(final VariantContext vc) {
    final List<Allele> siteAlleles = vc.getAlleles();
    final Map<Allele, Integer> offsets = new HashMap<Allele, Integer>(siteAlleles.size() + 1);
    offsets.put(Allele.NO_CALL, -1); // convenience for lookup

    int offset = 0;
    for ( final Allele allele : siteAlleles )
        offsets.put(allele, offset++);

    return offsets;
}
// --------------------------------------------------------------------------------
//
// Low-level block encoding
@ -499,16 +279,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
outputStream.write(genotypesBlock);
}
/**
 * Encode a string as a typed reference into the header's string dictionary.
 *
 * @param string the dictionary string to reference; must appear in the header
 * @return the integer type used to encode the dictionary offset
 * @throws IOException if the underlying encoder fails
 * @throws ReviewedStingException if the string is absent from the dictionary
 */
@Requires("string != null")
@Ensures("BCF2Type.INTEGERS.contains(result)")
private final BCF2Type encodeStringByRef(final String string) throws IOException {
    final Integer dictionaryOffset = stringDictionaryMap.get(string);
    if ( dictionaryOffset == null )
        throw new ReviewedStingException("Format error: could not find string " + string + " in header as required by BCF");

    // use the smallest integer type that can hold the offset
    final BCF2Type offsetType = BCF2Utils.determineIntegerType(dictionaryOffset);
    encoder.encodeTyped(dictionaryOffset, offsetType);
    return offsetType;
}
@Requires("! strings.isEmpty()")
@Ensures("BCF2Type.INTEGERS.contains(result)")
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
@ -540,29 +310,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
return maxType;
}
/**
 * Begin a genotype (FORMAT) field block: write the field's key as a string
 * dictionary reference, followed by the typed (size, valueType) descriptor.
 * The caller is then responsible for emitting size values of valueType per
 * sample; the two calls below must stay in this order to produce a valid block.
 *
 * @param key       the FORMAT field name; must exist in the string dictionary
 * @param size      the number of values per sample for this field
 * @param valueType the BCF2 type of each value
 * @throws IOException if the underlying encoder fails
 */
@Requires({"key != null", "! key.equals(\"\")", "size >= 0"})
private final void startGenotypeField(final String key, final int size, final BCF2Type valueType) throws IOException {
    encodeStringByRef(key);
    encoder.encodeType(size, valueType);
}
/**
 * Normalize an arbitrary attribute value into list form:
 *
 *   o == null   => the empty list
 *   o is a List => o itself (unchecked cast to List&lt;Object&gt;)
 *   otherwise   => a singleton list containing o
 *
 * NOTE(review): this duplicates the BCF2Utils.toList helper added in this
 * commit -- consider delegating to it so the logic lives in one place.
 *
 * @param o the value to wrap; may be null
 * @return a List view of o, never null
 */
@SuppressWarnings("unchecked")
private final static List<Object> toList(final Object o) {
    if ( o instanceof List )
        return (List<Object>) o;
    return o == null ? Collections.<Object>emptyList() : Collections.singletonList(o);
}
/**
* Create the contigDictionary from the contigLines extracted from the VCF header
*

View File

@ -29,7 +29,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("bf5c76bec6e00199d441b6175b6b7c39"));
Arrays.asList("40dae29480e26efb117344e0e82e997c"));
executeTest("test MultiSample Pilot1", spec);
}
@ -37,7 +37,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testWithAllelesPassedIn1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + testDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("9f56f8d62c047213c894c3f250706aea"));
Arrays.asList("96b9f6005d9437bf527344cf98319739"));
executeTest("test MultiSample Pilot2 with alleles passed in", spec1);
}
@ -45,7 +45,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testWithAllelesPassedIn2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + testDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("0a5048062cd9022b761ae87efed5957e"));
Arrays.asList("02c5c466ad26104b64166beb788a618c"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}
@ -53,7 +53,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("f50a30bf9bbd4e5dcd5d7d9282b6dadf"));
Arrays.asList("7cc7ec249243a0dcb9e3a3033d4704e4"));
executeTest("test SingleSample Pilot2", spec);
}
@ -61,7 +61,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + testDir + "multiallelic.snps.bam -o %s -L " + testDir + "multiallelic.snps.intervals", 1,
Arrays.asList("6fb6ea5f2b9da02a0fea7cb2994fb5db"));
Arrays.asList("c0416efb6ff481889542e7807baa495c"));
executeTest("test Multiple SNP alleles", spec);
}
@ -69,7 +69,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testBadRead() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH -I " + testDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1,
Arrays.asList("95158fb50db5d41a678cd331a3ffe5e1"));
Arrays.asList("e2cf97bca4a720ca64ca7f682da6c9f0"));
executeTest("test bad read", spec);
}
@ -77,7 +77,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("c86e05f315a86bc190d72cde911e6fe2"));
Arrays.asList("976694d7aedd0272a3e2ba46e00581f5"));
executeTest("test reverse trim", spec);
}
@ -87,7 +87,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "f6d655714706b6e8390037db3fad60ef";
private final static String COMPRESSED_OUTPUT_MD5 = "c0f3d88386367fcd66d196b49a22bfaa";
@Test
public void testCompressedOutput() {
@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
String md5 = "7bc812cc553b4ab77c08049f0e32d0f6";
String md5 = "34cb7146c037925e8f324cffd986834d";
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
@ -140,7 +140,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinBaseQualityScore() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
Arrays.asList("dfeaccb68165fdaffafde9150914432d"));
Arrays.asList("ac4c8e0fdaf94a7949ceb157d3c836a9"));
executeTest("test min_base_quality_score 26", spec);
}
@ -148,7 +148,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("35ef19b4f248969c74da8bd7489385d6"));
Arrays.asList("751cfe684b0ed19b0d509bb7a315d65d"));
executeTest("test SLOD", spec);
}
@ -156,7 +156,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testNDA() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("aa49989fde8c6378f5c751f8b267c471"));
Arrays.asList("cbf1b8b261f726abe43045806e17e841"));
executeTest("test NDA", spec);
}
@ -164,23 +164,23 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testCompTrack() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("ffaeb60a5776d85b41c64786ddc4d14d"));
Arrays.asList("70fd3b551f9def332393b7f3976440ac"));
executeTest("test using comp track", spec);
}
@Test
public void testOutputParameterSitesOnly() {
testOutputParameters("-sites_only", "f9a4005c53291170800e6023503d5635");
testOutputParameters("-sites_only", "9700208609431548b2651dc582f1ea10");
}
@Test
public void testOutputParameterAllConfident() {
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "e6c63baff51aaeb318c8bebaf2989828");
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "b7bfc2a5298f86e3aef1bb70bebfb38e");
}
@Test
public void testOutputParameterAllSites() {
testOutputParameters("--output_mode EMIT_ALL_SITES", "43ffa34646d781a368ea81342c21ae2e");
testOutputParameters("--output_mode EMIT_ALL_SITES", "90dcce8cd6de92258f332b261e64a5e9");
}
private void testOutputParameters(final String args, final String md5) {
@ -194,7 +194,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
Arrays.asList("c7cb29121eb30e752ab6652a6d2a62a6"));
Arrays.asList("c5311739d1bba16cd59fb33e5f92ff49"));
executeTest("test confidence 1", spec1);
}
@ -202,7 +202,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
Arrays.asList("e7bdb76be82420a03ff28038d283822d"));
Arrays.asList("189f5b7026595a0814e1eb1466834760"));
executeTest("test confidence 2", spec2);
}
@ -213,12 +213,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
@Test
public void testHeterozyosity1() {
testHeterozosity( 0.01, "ca65e199e9ff0bc986df3dee74e11eb1" );
testHeterozosity( 0.01, "a818570e33316086ad5cb95992faca69" );
}
@Test
public void testHeterozyosity2() {
testHeterozosity( 1.0 / 1850, "ddcdfe4a5252da59278a6f1ba6f8a175" );
testHeterozosity( 1.0 / 1850, "3bfdba9bcc9c84d326750ad4a879897f" );
}
private void testHeterozosity(final double arg, final String md5) {
@ -242,7 +242,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("c4b3876d76e3d0fb78a1d3ebd674f1a1"));
Arrays.asList("1bd18653e8d9bf72c180142c2287eb5e"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -261,7 +261,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("41445b1cd1a82af71126ff1692f7a5fe"));
Arrays.asList("1defb30f6b870fbe5bcb27aed22ffb04"));
executeTest(String.format("test calling with BAQ"), spec);
}
@ -280,7 +280,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("c9e79470a4ce6eacde366e9fcf4d5b14"));
Arrays.asList("2d52f5243da578c4070b03881852aae3"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -295,7 +295,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("70f8a17ba68131520db5c764ac5acdd2"));
Arrays.asList("9271105e630ab39cf1c88b338da54594"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@ -308,7 +308,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("e4316d80fd833886820c8b4e122fbfc4"));
Arrays.asList("80f0221c47c30a03cb902b1c54c1eb12"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -318,7 +318,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + testDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("c92aba3635f3331ddf8ae7a0382ca594"));
Arrays.asList("ce40ff952025e806638156c3cd002927"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@ -328,7 +328,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ testDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("b87034f349887160ec1124e12863d543"));
Arrays.asList("4738fa9e68a010e05c64ae965ed94de3"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@ -336,13 +336,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSampleIndels1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("51e6a7868d2ea2daefa411ed82f18be2"));
Arrays.asList("0b0114364680a23e125f469208b96c6f"));
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("954c52be0c6ca9ed5a213a53f4efbc10"));
Arrays.asList("adcf53b8dcfde7f2c657745751549bfe"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -352,7 +352,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + testDir + vcf + " -I " + validationDataLocation +
"NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1,
Arrays.asList("ae44230ed54fd8ce63711cae908470cb"));
Arrays.asList("3e3ac23846801c34acbf10a1a527264a"));
executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec);
}
@ -385,7 +385,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("471012c1d3dbec4633710264de5daa24"));
Arrays.asList("eb9d8d1b19617190762f62bad139263a"));
executeTest("test minIndelFraction 0.0", spec);
}
@ -393,7 +393,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("9165507fb202d515512a947a8a9db6bb"));
Arrays.asList("952f46506d3ef9c11cafcc072a00750a"));
executeTest("test minIndelFraction 0.25", spec);
}
@ -401,7 +401,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinIndelFraction100() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 1", 1,
Arrays.asList("c1bbd4998b7c6dffee1682d3e5c929cc"));
Arrays.asList("50a6774b7d8f71fe0e125c204d50ba84"));
executeTest("test minIndelFraction 1.0", spec);
}
}

View File

@ -156,20 +156,32 @@ public class VariantContextTestProvider {
}
}
/**
 * Register a header line with a fixed value count under both INFO and FORMAT.
 *
 * Flag-typed lines are only added as INFO, since FORMAT fields cannot be flags.
 *
 * @param metaData the header-line set to extend
 * @param id       the field ID
 * @param count    the fixed number of values for the field
 * @param type     the field's value type
 */
private final static void addHeaderLine(final Set<VCFHeaderLine> metaData, final String id, final int count, final VCFHeaderLineType type) {
    metaData.add(new VCFInfoHeaderLine(id, count, type, "x"));
    final boolean validAsFormat = type != VCFHeaderLineType.Flag;
    if ( validAsFormat ) {
        metaData.add(new VCFFormatHeaderLine(id, count, type, "x"));
    }
}
/**
 * Register a header line with a symbolic value count (e.g. UNBOUNDED, G)
 * under both INFO and FORMAT.
 *
 * Flag-typed lines are only added as INFO, since FORMAT fields cannot be flags.
 *
 * @param metaData the header-line set to extend
 * @param id       the field ID
 * @param count    the symbolic count for the field
 * @param type     the field's value type
 */
private final static void addHeaderLine(final Set<VCFHeaderLine> metaData, final String id, final VCFHeaderLineCount count, final VCFHeaderLineType type) {
    metaData.add(new VCFInfoHeaderLine(id, count, type, "x"));
    final boolean validAsFormat = type != VCFHeaderLineType.Flag;
    if ( validAsFormat ) {
        metaData.add(new VCFFormatHeaderLine(id, count, type, "x"));
    }
}
private static void createSyntheticHeader() {
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
metaData.add(new VCFInfoHeaderLine("STRING1", 1, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("VAR.INFO.STRING", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "x"));
addHeaderLine(metaData, "STRING1", 1, VCFHeaderLineType.String);
addHeaderLine(metaData, "STRING3", 3, VCFHeaderLineType.String);
addHeaderLine(metaData, "STRING20", 20, VCFHeaderLineType.String);
addHeaderLine(metaData, "VAR.INFO.STRING", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String);
metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
metaData.add(new VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality"));
metaData.add(new VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
metaData.add(new VCFFormatHeaderLine("GS", 2, VCFHeaderLineType.String, "A doubleton list of strings"));
metaData.add(new VCFFormatHeaderLine("GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "A variable list of strings"));
metaData.add(new VCFFormatHeaderLine("FT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype filters"));
addHeaderLine(metaData, "GT", 1, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "GQ", 1, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String);
addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String);
addHeaderLine(metaData, "FT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String);
// prep the header
metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0));
@ -177,13 +189,13 @@ public class VariantContextTestProvider {
metaData.add(new VCFFilterHeaderLine("FILTER1"));
metaData.add(new VCFFilterHeaderLine("FILTER2"));
metaData.add(new VCFInfoHeaderLine("INT1", 1, VCFHeaderLineType.Integer, "x"));
metaData.add(new VCFInfoHeaderLine("INT3", 3, VCFHeaderLineType.Integer, "x"));
metaData.add(new VCFInfoHeaderLine("INT20", 20, VCFHeaderLineType.Integer, "x"));
metaData.add(new VCFInfoHeaderLine("INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
metaData.add(new VCFInfoHeaderLine("FLOAT1", 1, VCFHeaderLineType.Float, "x"));
metaData.add(new VCFInfoHeaderLine("FLOAT3", 3, VCFHeaderLineType.Float, "x"));
metaData.add(new VCFInfoHeaderLine("FLAG", 0, VCFHeaderLineType.Flag, "x"));
addHeaderLine(metaData, "INT1", 1, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "INT3", 3, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "INT20", 20, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "FLOAT1", 1, VCFHeaderLineType.Float);
addHeaderLine(metaData, "FLOAT3", 3, VCFHeaderLineType.Float);
addHeaderLine(metaData, "FLAG", 0, VCFHeaderLineType.Flag);
syntheticHeader = new VCFHeader(metaData);
}