Edge case fixes for BCF2
--handle entirely missing GT in a sample in decodeGenotypeAlleles --Create MAX_ALLELES_IN_GENOTYPES constant in BCF2Utils, and extracted its use inline from the code -- Generalized genotype writing code to handle ploidy != 2 and variable ploidy among samples -- Remove special case inline treatment of case where all samples have no GT field values, and moved this into calcVCFGenotypeKeys -- Removed restriction on getPloidy requiring ploidy > 1. It's logically find to return 0 for a no called sample -- getMaxPloidy() in VC that does what it says -- Support for padding / depadding of generic genotype fields
This commit is contained in:
parent
40431890be
commit
c8ed0bfc4c
|
|
@ -365,19 +365,26 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
|
|||
}
|
||||
|
||||
private final List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
|
||||
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
|
||||
for ( final Integer encode : encoded ) {
|
||||
if ( encode == null ) // absent, as are all following by definition
|
||||
return gt;
|
||||
else {
|
||||
final int offset = encode >> 1;
|
||||
if ( offset == 0 )
|
||||
gt.add(Allele.NO_CALL);
|
||||
else
|
||||
gt.add(siteAlleles.get(offset - 1));
|
||||
if ( encoded == null )
|
||||
// no called sample GT = .
|
||||
return Collections.emptyList();
|
||||
else {
|
||||
// we have at least some alleles to decode
|
||||
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
|
||||
for ( final Integer encode : encoded ) {
|
||||
if ( encode == null ) // absent, as are all following by definition
|
||||
return gt;
|
||||
else {
|
||||
final int offset = encode >> 1;
|
||||
if ( offset == 0 )
|
||||
gt.add(Allele.NO_CALL);
|
||||
else
|
||||
gt.add(siteAlleles.get(offset - 1));
|
||||
}
|
||||
}
|
||||
|
||||
return gt;
|
||||
}
|
||||
return gt;
|
||||
}
|
||||
|
||||
private final Map<String, List<Object>> decodeGenotypeFieldValues(final int nFields, final int nSamples) {
|
||||
|
|
|
|||
|
|
@ -145,9 +145,11 @@ public class BCF2Decoder {
|
|||
} else {
|
||||
final ArrayList<Object> ints = new ArrayList<Object>(size);
|
||||
for ( int i = 0; i < size; i++ ) {
|
||||
ints.add(decodeSingleValue(type));
|
||||
final Object val = decodeSingleValue(type);
|
||||
if ( val == null ) continue; // auto-pruning. We remove trailing nulls
|
||||
ints.add(val);
|
||||
}
|
||||
return ints.get(0) == null ? null : ints; // return null when all of the values are null
|
||||
return ints.isEmpty() ? null : ints; // return null when all of the values are null
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -47,6 +47,8 @@ import java.util.List;
|
|||
public class BCF2Utils {
|
||||
public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes();
|
||||
|
||||
public static final int MAX_ALLELES_IN_GENOTYPES = 127;
|
||||
|
||||
public static final int OVERFLOW_ELEMENT_MARKER = 15;
|
||||
public static final int MAX_INLINE_ELEMENTS = 14;
|
||||
|
||||
|
|
|
|||
|
|
@ -108,11 +108,9 @@ public class Genotype implements Comparable<Genotype> {
|
|||
public boolean isPhased() { return isPhased; }
|
||||
|
||||
/**
|
||||
* @return the ploidy of this genotype
|
||||
* @return the ploidy of this genotype. 0 if the site is no-called.
|
||||
*/
|
||||
public int getPloidy() {
|
||||
if ( alleles.size() == 0 )
|
||||
throw new ReviewedStingException("Requesting ploidy for an UNAVAILABLE genotype");
|
||||
return alleles.size();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -586,6 +586,18 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
return alleles.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the maximum ploidy of all samples in this VC, or -1 if there are no genotypes
|
||||
* @return -1, or the max ploidy
|
||||
*/
|
||||
public int getMaxPloidy() {
|
||||
int max = -1;
|
||||
for ( final Genotype g : getGenotypes() ) {
|
||||
max = Math.max(g.getPloidy(), max);
|
||||
}
|
||||
return max;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The allele sharing the same bases as this String. A convenience method; better to use byte[]
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -246,17 +246,9 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
if ( o == null ) continue;
|
||||
if ( o instanceof List ) {
|
||||
// only do compute if first value is of type list
|
||||
final List values = (List)g.getAttribute(field);
|
||||
if ( values != null ) {
|
||||
if ( values.size() != size && size != -1 )
|
||||
throw new ReviewedStingException("BUG: BCF2 codec doesn't currently support padding " +
|
||||
"/ depadding of genotype fields with mixed length." +
|
||||
"Occurred in field " + field + " at " + vc.getChr() + ":" + vc.getStart());
|
||||
size = Math.max(size, values.size());
|
||||
}
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
size = Math.max(size, ((List)o).size());
|
||||
} else if ( size == -1 )
|
||||
size = 1;
|
||||
}
|
||||
|
||||
return size;
|
||||
|
|
@ -268,20 +260,28 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
|
||||
startGenotypeField(field, numInFormatField, encoding.BCF2Type);
|
||||
for ( final Genotype g : vc.getGenotypes() ) {
|
||||
if ( ! g.hasAttribute(field) ) {
|
||||
encoder.encodeRawMissingValues(numInFormatField, encoding.BCF2Type);
|
||||
final Object fieldValue = g.getAttribute(field);
|
||||
|
||||
if ( numInFormatField == 1 ) {
|
||||
// we encode the actual allele, encodeRawValue handles the missing case where fieldValue == null
|
||||
encoder.encodeRawValue(fieldValue, encoding.BCF2Type);
|
||||
} else {
|
||||
final Object val = g.getAttribute(field);
|
||||
if ( (val instanceof List) && (((List) val).size() != numInFormatField ))
|
||||
throw new ReviewedStingException("BUG: header for " + field +
|
||||
" has inconsistent number of values " + numInFormatField +
|
||||
" compared to values in VariantContext " + ((List) val).size());
|
||||
final List<Object> vals = numInFormatField == 1 ? Collections.singletonList(val) : (List<Object>)val;
|
||||
encoder.encodeRawValues(vals, encoding.BCF2Type);
|
||||
// multiple values, need to handle general case
|
||||
final List<Object> asList = toList(fieldValue);
|
||||
final int nSampleValues = asList.size();
|
||||
for ( int i = 0; i < numInFormatField; i++ ) {
|
||||
encoder.encodeRawValue(i < nSampleValues ? asList.get(i) : null, encoding.BCF2Type);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final static List<Object> toList(final Object o) {
|
||||
if ( o == null ) return Collections.emptyList();
|
||||
else if ( o instanceof List ) return (List<Object>)o;
|
||||
else return Collections.singletonList(o);
|
||||
}
|
||||
|
||||
private final class VCFToBCFEncoding {
|
||||
VCFHeaderLineType vcfType;
|
||||
BCF2Type BCF2Type;
|
||||
|
|
@ -393,22 +393,29 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
}
|
||||
|
||||
private final void addGenotypes(final VariantContext vc) throws IOException {
|
||||
if ( vc.getNAlleles() > 127 )
|
||||
if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES )
|
||||
throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " +
|
||||
"with > 127 alleles, but you have " + vc.getNAlleles() + " at "
|
||||
+ vc.getChr() + ":" + vc.getStart());
|
||||
"with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have "
|
||||
+ vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart());
|
||||
|
||||
final Map<Allele, String> alleleMap = VCFWriter.buildAlleleMap(vc);
|
||||
final int requiredPloidy = 2; // TODO -- handle ploidy, will need padding / depadding
|
||||
startGenotypeField(VCFConstants.GENOTYPE_KEY, requiredPloidy, BCF2Type.INT8);
|
||||
final int maxPloidy = vc.getMaxPloidy();
|
||||
startGenotypeField(VCFConstants.GENOTYPE_KEY, maxPloidy, BCF2Type.INT8);
|
||||
for ( final Genotype g : vc.getGenotypes() ) {
|
||||
if ( g.getPloidy() != requiredPloidy ) throw new ReviewedStingException("Cannot currently handle non-diploid calls!");
|
||||
final List<Integer> encoding = new ArrayList<Integer>(requiredPloidy);
|
||||
for ( final Allele a : g.getAlleles() ) {
|
||||
final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a));
|
||||
encoding.add(((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00));
|
||||
final List<Allele> alleles = g.getAlleles();
|
||||
final int samplePloidy = alleles.size();
|
||||
for ( int i = 0; i < maxPloidy; i++ ) {
|
||||
if ( i < samplePloidy ) {
|
||||
// we encode the actual allele
|
||||
final Allele a = alleles.get(i);
|
||||
final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a));
|
||||
final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00);
|
||||
encoder.encodePrimitive(encoded, BCF2Type.INT8);
|
||||
} else {
|
||||
// we need to pad with missing as we have ploidy < max for this sample
|
||||
encoder.encodePrimitive(BCF2Type.INT8.getMissingBytes(), BCF2Type.INT8);
|
||||
}
|
||||
}
|
||||
encoder.encodeRawValues(encoding, BCF2Type.INT8);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -257,9 +257,6 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
List<String> genotypeAttributeKeys = new ArrayList<String>();
|
||||
if ( vc.hasGenotypes() ) {
|
||||
genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
|
||||
} else if ( mHeader.hasGenotypingData() ) {
|
||||
// this needs to be done in case all samples are no-calls
|
||||
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
|
||||
}
|
||||
|
||||
if ( genotypeAttributeKeys.size() > 0 ) {
|
||||
|
|
@ -470,6 +467,11 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine which genotype fields are in use in the genotypes in VC
|
||||
* @param vc
|
||||
* @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first
|
||||
*/
|
||||
public static List<String> calcVCFGenotypeKeys(VariantContext vc) {
|
||||
Set<String> keys = new HashSet<String>();
|
||||
|
||||
|
|
@ -502,7 +504,12 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
sortedList = newList;
|
||||
}
|
||||
|
||||
return sortedList;
|
||||
if ( sortedList.isEmpty() && vc.hasGenotypes() ) {
|
||||
// this needs to be done in case all samples are no-calls
|
||||
return Collections.singletonList(VCFConstants.GENOTYPE_KEY);
|
||||
} else {
|
||||
return sortedList;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -46,8 +46,8 @@ import java.util.*;
|
|||
* @since Date created
|
||||
*/
|
||||
public class VariantContextTestProvider {
|
||||
final private static boolean ENABLE_VARARRAY_TESTS = false;
|
||||
final private static boolean ENABLE_PLOIDY_TESTS = false;
|
||||
final private static boolean ENABLE_VARARRAY_TESTS = true;
|
||||
final private static boolean ENABLE_PLOIDY_TESTS = true;
|
||||
final private static boolean ENABLE_PL_TESTS = true;
|
||||
private static VCFHeader syntheticHeader;
|
||||
final static List<VariantContextTestData> TEST_DATAs = new ArrayList<VariantContextTestData>();
|
||||
|
|
@ -282,6 +282,7 @@ public class VariantContextTestProvider {
|
|||
final Genotype homVar = new Genotype("homVar", Arrays.asList(alt1, alt1));
|
||||
addGenotypeTests(site, homRef, het);
|
||||
addGenotypeTests(site, homRef, het, homVar);
|
||||
final List<Allele> noCall = new ArrayList<Allele>();
|
||||
|
||||
// ploidy
|
||||
if ( ENABLE_PLOIDY_TESTS ) {
|
||||
|
|
@ -292,6 +293,11 @@ public class VariantContextTestProvider {
|
|||
addGenotypeTests(site,
|
||||
new Genotype("dip", Arrays.asList(ref, alt1)),
|
||||
new Genotype("tet", Arrays.asList(ref, alt1, alt1)));
|
||||
|
||||
addGenotypeTests(site,
|
||||
new Genotype("nocall", noCall),
|
||||
new Genotype("dip", Arrays.asList(ref, alt1)),
|
||||
new Genotype("tet", Arrays.asList(ref, alt1, alt1)));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue