Edge case fixes for BCF2

--handle entirely missing GT in a sample in decodeGenotypeAlleles
--Create MAX_ALLELES_IN_GENOTYPES constant in BCF2Utils, and extracted its use inline from the code
-- Generalized genotype writing code to handle ploidy != 2 and variable ploidy among samples
-- Remove special case inline treatment of case where all samples have no GT field values, and moved this into calcVCFGenotypeKeys
-- Removed restriction on getPloidy requiring ploidy > 1.  It's logically find to return 0 for a no called sample
-- getMaxPloidy() in VC that does what it says
-- Support for padding / depadding of generic genotype fields
This commit is contained in:
Mark DePristo 2012-05-18 15:49:23 -04:00
parent 40431890be
commit c8ed0bfc4c
8 changed files with 94 additions and 53 deletions

View File

@ -365,19 +365,26 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
}
private final List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
for ( final Integer encode : encoded ) {
if ( encode == null ) // absent, as are all following by definition
return gt;
else {
final int offset = encode >> 1;
if ( offset == 0 )
gt.add(Allele.NO_CALL);
else
gt.add(siteAlleles.get(offset - 1));
if ( encoded == null )
// no called sample GT = .
return Collections.emptyList();
else {
// we have at least some alleles to decode
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
for ( final Integer encode : encoded ) {
if ( encode == null ) // absent, as are all following by definition
return gt;
else {
final int offset = encode >> 1;
if ( offset == 0 )
gt.add(Allele.NO_CALL);
else
gt.add(siteAlleles.get(offset - 1));
}
}
return gt;
}
return gt;
}
private final Map<String, List<Object>> decodeGenotypeFieldValues(final int nFields, final int nSamples) {

View File

@ -145,9 +145,11 @@ public class BCF2Decoder {
} else {
final ArrayList<Object> ints = new ArrayList<Object>(size);
for ( int i = 0; i < size; i++ ) {
ints.add(decodeSingleValue(type));
final Object val = decodeSingleValue(type);
if ( val == null ) continue; // auto-pruning. We remove trailing nulls
ints.add(val);
}
return ints.get(0) == null ? null : ints; // return null when all of the values are null
return ints.isEmpty() ? null : ints; // return null when all of the values are null
}
}

View File

@ -47,6 +47,8 @@ import java.util.List;
public class BCF2Utils {
public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes();
public static final int MAX_ALLELES_IN_GENOTYPES = 127;
public static final int OVERFLOW_ELEMENT_MARKER = 15;
public static final int MAX_INLINE_ELEMENTS = 14;

View File

@ -108,11 +108,9 @@ public class Genotype implements Comparable<Genotype> {
public boolean isPhased() { return isPhased; }
/**
* @return the ploidy of this genotype
* @return the ploidy of this genotype. 0 if the site is no-called.
*/
public int getPloidy() {
if ( alleles.size() == 0 )
throw new ReviewedStingException("Requesting ploidy for an UNAVAILABLE genotype");
return alleles.size();
}

View File

@ -586,6 +586,18 @@ public class VariantContext implements Feature { // to enable tribble integratio
return alleles.size();
}
/**
* Returns the maximum ploidy of all samples in this VC, or -1 if there are no genotypes
* @return -1, or the max ploidy
*/
public int getMaxPloidy() {
int max = -1;
for ( final Genotype g : getGenotypes() ) {
max = Math.max(g.getPloidy(), max);
}
return max;
}
/**
* @return The allele sharing the same bases as this String. A convenience method; better to use byte[]
*/

View File

@ -246,17 +246,9 @@ class BCF2Writer extends IndexingVariantContextWriter {
if ( o == null ) continue;
if ( o instanceof List ) {
// only do compute if first value is of type list
final List values = (List)g.getAttribute(field);
if ( values != null ) {
if ( values.size() != size && size != -1 )
throw new ReviewedStingException("BUG: BCF2 codec doesn't currently support padding " +
"/ depadding of genotype fields with mixed length." +
"Occurred in field " + field + " at " + vc.getChr() + ":" + vc.getStart());
size = Math.max(size, values.size());
}
} else {
return 1;
}
size = Math.max(size, ((List)o).size());
} else if ( size == -1 )
size = 1;
}
return size;
@ -268,20 +260,28 @@ class BCF2Writer extends IndexingVariantContextWriter {
startGenotypeField(field, numInFormatField, encoding.BCF2Type);
for ( final Genotype g : vc.getGenotypes() ) {
if ( ! g.hasAttribute(field) ) {
encoder.encodeRawMissingValues(numInFormatField, encoding.BCF2Type);
final Object fieldValue = g.getAttribute(field);
if ( numInFormatField == 1 ) {
// we encode the actual allele, encodeRawValue handles the missing case where fieldValue == null
encoder.encodeRawValue(fieldValue, encoding.BCF2Type);
} else {
final Object val = g.getAttribute(field);
if ( (val instanceof List) && (((List) val).size() != numInFormatField ))
throw new ReviewedStingException("BUG: header for " + field +
" has inconsistent number of values " + numInFormatField +
" compared to values in VariantContext " + ((List) val).size());
final List<Object> vals = numInFormatField == 1 ? Collections.singletonList(val) : (List<Object>)val;
encoder.encodeRawValues(vals, encoding.BCF2Type);
// multiple values, need to handle general case
final List<Object> asList = toList(fieldValue);
final int nSampleValues = asList.size();
for ( int i = 0; i < numInFormatField; i++ ) {
encoder.encodeRawValue(i < nSampleValues ? asList.get(i) : null, encoding.BCF2Type);
}
}
}
}
private final static List<Object> toList(final Object o) {
if ( o == null ) return Collections.emptyList();
else if ( o instanceof List ) return (List<Object>)o;
else return Collections.singletonList(o);
}
private final class VCFToBCFEncoding {
VCFHeaderLineType vcfType;
BCF2Type BCF2Type;
@ -393,22 +393,29 @@ class BCF2Writer extends IndexingVariantContextWriter {
}
private final void addGenotypes(final VariantContext vc) throws IOException {
if ( vc.getNAlleles() > 127 )
if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES )
throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " +
"with > 127 alleles, but you have " + vc.getNAlleles() + " at "
+ vc.getChr() + ":" + vc.getStart());
"with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have "
+ vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart());
final Map<Allele, String> alleleMap = VCFWriter.buildAlleleMap(vc);
final int requiredPloidy = 2; // TODO -- handle ploidy, will need padding / depadding
startGenotypeField(VCFConstants.GENOTYPE_KEY, requiredPloidy, BCF2Type.INT8);
final int maxPloidy = vc.getMaxPloidy();
startGenotypeField(VCFConstants.GENOTYPE_KEY, maxPloidy, BCF2Type.INT8);
for ( final Genotype g : vc.getGenotypes() ) {
if ( g.getPloidy() != requiredPloidy ) throw new ReviewedStingException("Cannot currently handle non-diploid calls!");
final List<Integer> encoding = new ArrayList<Integer>(requiredPloidy);
for ( final Allele a : g.getAlleles() ) {
final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a));
encoding.add(((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00));
final List<Allele> alleles = g.getAlleles();
final int samplePloidy = alleles.size();
for ( int i = 0; i < maxPloidy; i++ ) {
if ( i < samplePloidy ) {
// we encode the actual allele
final Allele a = alleles.get(i);
final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a));
final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00);
encoder.encodePrimitive(encoded, BCF2Type.INT8);
} else {
// we need to pad with missing as we have ploidy < max for this sample
encoder.encodePrimitive(BCF2Type.INT8.getMissingBytes(), BCF2Type.INT8);
}
}
encoder.encodeRawValues(encoding, BCF2Type.INT8);
}
}

View File

@ -257,9 +257,6 @@ class VCFWriter extends IndexingVariantContextWriter {
List<String> genotypeAttributeKeys = new ArrayList<String>();
if ( vc.hasGenotypes() ) {
genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
} else if ( mHeader.hasGenotypingData() ) {
// this needs to be done in case all samples are no-calls
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
}
if ( genotypeAttributeKeys.size() > 0 ) {
@ -470,6 +467,11 @@ class VCFWriter extends IndexingVariantContextWriter {
return result;
}
/**
* Determine which genotype fields are in use in the genotypes in VC
* @param vc
* @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first
*/
public static List<String> calcVCFGenotypeKeys(VariantContext vc) {
Set<String> keys = new HashSet<String>();
@ -502,7 +504,12 @@ class VCFWriter extends IndexingVariantContextWriter {
sortedList = newList;
}
return sortedList;
if ( sortedList.isEmpty() && vc.hasGenotypes() ) {
// this needs to be done in case all samples are no-calls
return Collections.singletonList(VCFConstants.GENOTYPE_KEY);
} else {
return sortedList;
}
}

View File

@ -46,8 +46,8 @@ import java.util.*;
* @since Date created
*/
public class VariantContextTestProvider {
final private static boolean ENABLE_VARARRAY_TESTS = false;
final private static boolean ENABLE_PLOIDY_TESTS = false;
final private static boolean ENABLE_VARARRAY_TESTS = true;
final private static boolean ENABLE_PLOIDY_TESTS = true;
final private static boolean ENABLE_PL_TESTS = true;
private static VCFHeader syntheticHeader;
final static List<VariantContextTestData> TEST_DATAs = new ArrayList<VariantContextTestData>();
@ -282,6 +282,7 @@ public class VariantContextTestProvider {
final Genotype homVar = new Genotype("homVar", Arrays.asList(alt1, alt1));
addGenotypeTests(site, homRef, het);
addGenotypeTests(site, homRef, het, homVar);
final List<Allele> noCall = new ArrayList<Allele>();
// ploidy
if ( ENABLE_PLOIDY_TESTS ) {
@ -292,6 +293,11 @@ public class VariantContextTestProvider {
addGenotypeTests(site,
new Genotype("dip", Arrays.asList(ref, alt1)),
new Genotype("tet", Arrays.asList(ref, alt1, alt1)));
addGenotypeTests(site,
new Genotype("nocall", noCall),
new Genotype("dip", Arrays.asList(ref, alt1)),
new Genotype("tet", Arrays.asList(ref, alt1, alt1)));
}
}