Bugfixes to G count types in VCF header

-- Previously VCF header lines of count type G assumed that the sample would be diploid.
-- Generalized the code to take a VariantContext and return the right result for G count types by calling into the correct numGenotypes in GenotypeLikelihoods class
-- renamed calcNumGenotypes to numGenotypes, which uses a static cache in the class
-- calcNumGenotypes is private, and is used to build the static cache or to compute on the fly for uncached No. allele / ploidy combinations
-- VariantContext calls into getMaxPloidy in GenotypesContext, which caches the max ploidy among samples
-- Added extensive unit tests that compare A and G type values in genotypes
This commit is contained in:
Mark DePristo 2012-06-26 15:28:17 -04:00
parent 7ef5ce28cc
commit 1f45551a15
11 changed files with 176 additions and 93 deletions

View File

@ -28,6 +28,8 @@ import org.apache.log4j.Logger;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.LinkedHashMap;
@ -67,14 +69,25 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
return count;
}
// utility method
public int getCount(int numAltAlleles) {
/**
* Get the number of values expected for this header field, given the properties of VariantContext vc
*
* If the count is a fixed count, return that. For example, a field with size of 1 in the header returns 1
* If the count is of type A, return vc.getNAlleles - 1
* If the count is of type G, return the expected number of genotypes given the number of alleles in VC and the
* max ploidy among all samples
* If the count is UNBOUNDED return -1
*
* @param vc
* @return
*/
public int getCount(final VariantContext vc) {
int myCount;
switch ( countType ) {
case INTEGER: myCount = count; break;
case UNBOUNDED: myCount = -1; break;
case A: myCount = numAltAlleles; break;
case G: myCount = ((numAltAlleles + 1) * (numAltAlleles + 2) / 2); break;
case A: myCount = vc.getNAlleles() - 1; break;
case G: myCount = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), vc.getMaxPloidy()); break;
default: throw new ReviewedStingException("Unknown count type: " + countType);
}
return myCount;

View File

@ -554,7 +554,7 @@ public abstract class Genotype implements Comparable<Genotype> {
pairs.add(k + "=" + c.get(k));
}
return "{" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}";
return pairs.isEmpty() ? "" : " {" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}";
}
/**

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.variantcontext;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
@ -34,6 +36,11 @@ import java.util.Arrays;
import java.util.EnumMap;
public class GenotypeLikelihoods {
private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5;
private final static int NUM_LIKELIHOODS_CACHE_PLOIDY = 10;
// caching numAlleles up to 5 and ploidy up to 10
private final static int[][] numLikelihoodCache = new int[NUM_LIKELIHOODS_CACHE_N_ALLELES][NUM_LIKELIHOODS_CACHE_PLOIDY];
public final static int MAX_PL = Short.MAX_VALUE;
//
@ -44,6 +51,30 @@ public class GenotypeLikelihoods {
private double[] log10Likelihoods = null;
private String likelihoodsAsString_PLs = null;
/**
* initialize num likelihoods cache
*/
static {
// must be done before PLIndexToAlleleIndex
for ( int numAlleles = 1; numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES; numAlleles++ ) {
//numLikelihoodCache[numAlleles] = new int[NUM_LIKELIHOODS_CACHE_PLOIDY];
for ( int ploidy = 1; ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY; ploidy++ ) {
numLikelihoodCache[numAlleles][ploidy] = calcNumLikelihoods(numAlleles, ploidy);
}
}
}
/**
* The maximum number of alleles that we can represent as genotype likelihoods
*/
public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50;
/*
* a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
*/
private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
public final static GenotypeLikelihoods fromPLField(String PLs) {
return new GenotypeLikelihoods(PLs);
}
@ -245,47 +276,11 @@ public class GenotypeLikelihoods {
return likelihoodsAsVector;
}
// // -------------------------------------------------------------------------------------
// //
// // List interface functions
// //
// // -------------------------------------------------------------------------------------
//
// private final void notImplemented() {
// throw new ReviewedStingException("BUG: code not implemented");
// }
//
// @Override public int size() { return getAsVector().length; }
// @Override public Double get(final int i) { return getAsVector()[i];}
// @Override public Double set(final int i, final Double aDouble) { return getAsVector()[i] = aDouble; }
// @Override public boolean isEmpty() { return false; }
// @Override public Iterator<Double> iterator() { return Arrays.asList(ArrayUtils.toObject(getAsVector())).iterator(); }
// @Override public Object[] toArray() { return ArrayUtils.toObject(getAsVector()); }
//
// // none of these are implemented
// @Override public boolean contains(final Object o) { notImplemented(); return false; }
// @Override public <T> T[] toArray(final T[] ts) { notImplemented(); return null; }
// @Override public boolean add(final Double aDouble) { notImplemented(); return false; }
// @Override public boolean remove(final Object o) {notImplemented(); return false; }
// @Override public boolean containsAll(final Collection<?> objects) { notImplemented(); return false; }
// @Override public boolean addAll(final Collection<? extends Double> doubles) { notImplemented(); return false; }
// @Override public boolean addAll(final int i, final Collection<? extends Double> doubles) { notImplemented(); return false; }
// @Override public boolean removeAll(final Collection<?> objects) { notImplemented(); return false; }
// @Override public boolean retainAll(final Collection<?> objects) { notImplemented(); return false; }
// @Override public void clear() { notImplemented(); }
// @Override public void add(final int i, final Double aDouble) { notImplemented(); }
// @Override public Double remove(final int i) { notImplemented(); return null; }
// @Override public int indexOf(final Object o) { notImplemented(); return -1; }
// @Override public int lastIndexOf(final Object o) { notImplemented(); return 0; }
// @Override public ListIterator<Double> listIterator() { notImplemented(); return null; }
// @Override public ListIterator<Double> listIterator(final int i) { notImplemented(); return null; }
// @Override public List<Double> subList(final int i, final int i1) { notImplemented(); return null; }
// -------------------------------------------------------------------------------------
//
// Static conversion utilities, going from GL/PL index to allele index and vice versa.
//
// -------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------
//
// Static conversion utilities, going from GL/PL index to allele index and vice versa.
//
// -------------------------------------------------------------------------------------
/*
* Class representing the 2 alleles (or rather their indexes into VariantContext.getAllele()) corresponding to a specific PL index.
@ -300,18 +295,8 @@ public class GenotypeLikelihoods {
}
}
/**
* The maximum number of alleles that we can represent as genotype likelihoods
*/
public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50;
/*
* a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
*/
private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) {
final int numLikelihoods = calculateNumLikelihoods(1+altAlleles, 2);
final int numLikelihoods = numLikelihoods(1 + altAlleles, 2);
final GenotypeLikelihoodsAllelePair[] cache = new GenotypeLikelihoodsAllelePair[numLikelihoods];
// for all possible combinations of 2 alleles
@ -330,6 +315,32 @@ public class GenotypeLikelihoods {
return cache;
}
// -------------------------------------------------------------------------------------
//
// num likelihoods given number of alleles and ploidy
//
// -------------------------------------------------------------------------------------
/**
* Actually does the computation in @see #numLikelihoods
*
* @param numAlleles
* @param ploidy
* @return
*/
private static final int calcNumLikelihoods(final int numAlleles, final int ploidy) {
if (numAlleles == 1)
return 1;
else if (ploidy == 1)
return numAlleles;
else {
int acc =0;
for (int k=0; k <= ploidy; k++ )
acc += calcNumLikelihoods(numAlleles - 1, ploidy - k);
return acc;
}
}
/**
* Compute how many likelihood elements are associated with the given number of alleles
* Equivalent to asking in how many ways N non-negative integers can add up to P is S(N,P)
@ -344,6 +355,8 @@ public class GenotypeLikelihoods {
* which is then, for ordering above, (2,0,0), (1,1,0), (0,2,0), (1,1,0), (0,1,1), (0,0,2)
* In general, for P=2 (regular biallelic), then S(N,2) = N*(N+1)/2
*
* Note this method caches the value for most common num Allele / ploidy combinations for efficiency
*
* Recursive implementation:
* S(N,P) = sum_{k=0}^P S(N-1,P-k)
* because if we have N integers, we can condition 1 integer to be = k, and then N-1 integers have to sum to P-K
@ -355,23 +368,16 @@ public class GenotypeLikelihoods {
* @param ploidy Ploidy, or number of chromosomes in set
* @return Number of likelihood elements we need to hold.
*/
public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) {
// fast, closed form solution for diploid samples (most common use case)
if (ploidy==2)
return numAlleles*(numAlleles+1)/2;
if (numAlleles == 1)
return 1;
else if (ploidy == 1)
return numAlleles;
int acc =0;
for (int k=0; k <= ploidy; k++ )
acc += calculateNumLikelihoods(numAlleles-1, ploidy-k);
return acc;
@Requires({"ploidy > 0", "numAlleles > 0"})
@Ensures("result > 0")
public static int numLikelihoods(final int numAlleles, final int ploidy) {
if ( numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES
&& ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY )
return numLikelihoodCache[numAlleles][ploidy];
else {
// have to calculate on the fly
return calcNumLikelihoods(numAlleles, ploidy);
}
}
// As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.

View File

@ -61,6 +61,11 @@ public class GenotypesContext implements List<Genotype> {
*/
ArrayList<Genotype> notToBeDirectlyAccessedGenotypes;
/**
* Cached value of the maximum ploidy observed among all samples
*/
private int maxPloidy = -1;
/** Are we allowing users to modify the list? */
boolean immutable = false;
@ -408,6 +413,16 @@ public class GenotypesContext implements List<Genotype> {
return getGenotypes().get(i);
}
@Ensures("result >= 0")
public int getMaxPloidy() {
if ( maxPloidy == -1 ) {
for ( final Genotype g : getGenotypes() ) {
maxPloidy = Math.max(g.getPloidy(), maxPloidy);
}
}
return maxPloidy;
}
/**
* Gets sample associated with this sampleName, or null if none is found
*

View File

@ -626,14 +626,13 @@ public class VariantContext implements Feature { // to enable tribble integratio
/**
* Returns the maximum ploidy of all samples in this VC, or -1 if there are no genotypes
*
* This function is caching, so it's only expensive on the first call
*
* @return -1, or the max ploidy
*/
public int getMaxPloidy() {
int max = -1;
for ( final Genotype g : getGenotypes() ) {
max = Math.max(g.getPloidy(), max);
}
return max;
return genotypes.getMaxPloidy();
}
/**
@ -1381,7 +1380,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
&& format.getCountType() != VCFHeaderLineCount.UNBOUNDED
&& format.getType() != VCFHeaderLineType.Flag ) { // we expect exactly the right number of elements
final int obsSize = decoded instanceof List ? ((List) decoded).size() : 1;
final int expSize = format.getCount(this.getNAlleles() - 1);
final int expSize = format.getCount(this);
if ( obsSize != expSize ) {
throw new UserException.MalformedVCFHeader("Discordant field size detected for field " +
field + " at " + getChr() + ":" + getStart() + ". Field had " + obsSize + " values " +

View File

@ -159,16 +159,20 @@ public class VariantContextBuilder {
return this;
}
public VariantContextBuilder alleles(final String ... alleleStrings) {
List<Allele> alleles = new ArrayList<Allele>(alleleStrings.length);
public VariantContextBuilder alleles(final List<String> alleleStrings) {
List<Allele> alleles = new ArrayList<Allele>(alleleStrings.size());
for ( int i = 0; i < alleleStrings.length; i++ ) {
alleles.add(Allele.create(alleleStrings[i], i == 0));
for ( int i = 0; i < alleleStrings.size(); i++ ) {
alleles.add(Allele.create(alleleStrings.get(i), i == 0));
}
return alleles(alleles);
}
public VariantContextBuilder alleles(final String ... alleleStrings) {
return alleles(Arrays.asList(alleleStrings));
}
public List<Allele> getAlleles() {
return new ArrayList<Allele>(alleles);
}

View File

@ -1199,8 +1199,8 @@ public class VariantContextUtils {
altAlleleIndexToUse[i] = true;
}
// calculateNumLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2
final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(1+numOriginalAltAlleles, DEFAULT_PLOIDY);
// numLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2
final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(1 + numOriginalAltAlleles, DEFAULT_PLOIDY);
for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) {
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
// consider this entry only if both of the alleles are good

View File

@ -185,7 +185,7 @@ public abstract class BCF2FieldEncoder {
@Requires("hasContextDeterminedNumElements()")
@Ensures("result >= 0")
public int numElements(final VariantContext vc) {
return headerLine.getCount(vc.getNAlleles() - 1);
return headerLine.getCount(vc);
}
/**

View File

@ -402,7 +402,7 @@ class VCFWriter extends IndexingVariantContextWriter {
VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field);
if ( metaData != null ) {
int numInFormatField = metaData.getCount(vc.getAlternateAlleles().size());
int numInFormatField = metaData.getCount(vc);
if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) {
// If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
// For example, if Number=2, the string has to be ".,."

View File

@ -100,10 +100,10 @@ public class GenotypeLikelihoodsUnitTest {
for (int nAlleles=2; nAlleles<=5; nAlleles++)
// simplest case: diploid
Assert.assertEquals(GenotypeLikelihoods.calculateNumLikelihoods(nAlleles, 2), nAlleles*(nAlleles+1)/2);
Assert.assertEquals(GenotypeLikelihoods.numLikelihoods(nAlleles, 2), nAlleles*(nAlleles+1)/2);
// some special cases: ploidy = 20, #alleles = 4
Assert.assertEquals(GenotypeLikelihoods.calculateNumLikelihoods(4, 20), 1771);
Assert.assertEquals(GenotypeLikelihoods.numLikelihoods(4, 20), 1771);
}
@Test

View File

@ -51,6 +51,8 @@ import java.util.*;
public class VariantContextTestProvider {
final protected static Logger logger = Logger.getLogger(VariantContextTestProvider.class);
final private static boolean ENABLE_GENOTYPE_TESTS = true;
final private static boolean ENABLE_A_AND_G_TESTS = true;
final private static boolean ENABLE_VARARRAY_TESTS = true;
final private static boolean ENABLE_PLOIDY_TESTS = true;
final private static boolean ENABLE_PL_TESTS = true;
@ -180,6 +182,7 @@ public class VariantContextTestProvider {
addHeaderLine(metaData, "GT", 1, VCFHeaderLineType.String);
addHeaderLine(metaData, "GQ", 1, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "ADA", VCFHeaderLineCount.A, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String);
addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String);
@ -268,9 +271,13 @@ public class VariantContextTestProvider {
add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2", "s3")));
add(builder().attribute("VAR.INFO.STRING", null));
addGenotypesToTestData();
if ( ENABLE_GENOTYPE_TESTS ) {
addGenotypesToTestData();
addComplexGenotypesTest();
}
addComplexGenotypesTest();
if ( ENABLE_A_AND_G_TESTS )
addGenotypesAndGTests();
}
private static void addGenotypesToTestData() {
@ -315,7 +322,6 @@ public class VariantContextTestProvider {
}
}
private static void addGenotypes( final VariantContext site) {
// test ref/ref
final Allele ref = site.getReference();
@ -516,6 +522,46 @@ public class VariantContextTestProvider {
}
}
private static void addGenotypesAndGTests() {
// for ( final int ploidy : Arrays.asList(2)) {
for ( final int ploidy : Arrays.asList(1, 2, 3, 4, 5)) {
final List<List<String>> alleleCombinations =
Arrays.asList(
Arrays.asList("A"),
Arrays.asList("A", "C"),
Arrays.asList("A", "C", "G"),
Arrays.asList("A", "C", "G", "T"));
for ( final List<String> alleles : alleleCombinations ) {
final VariantContextBuilder vcb = builder().alleles(alleles);
final VariantContext site = vcb.make();
final int nAlleles = site.getNAlleles();
final Allele ref = site.getReference();
// base genotype is ref/.../ref up to ploidy
final List<Allele> baseGenotype = new ArrayList<Allele>(ploidy);
for ( int i = 0; i < ploidy; i++) baseGenotype.add(ref);
final int nPLs = GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy);
// ada is 0, 1, ..., nAlleles - 1
final List<Integer> ada = new ArrayList<Integer>(nAlleles);
for ( int i = 0; i < nAlleles - 1; i++ ) ada.add(i);
// pl is 0, 1, ..., up to nPLs (complex calc of nAlleles and ploidy)
final int[] pl = new int[nPLs];
for ( int i = 0; i < pl.length; i++ ) pl[i] = i;
final GenotypeBuilder gb = new GenotypeBuilder("ADA_PL_SAMPLE");
gb.alleles(baseGenotype);
gb.PL(pl);
gb.attribute("ADA", nAlleles == 2 ? ada.get(0) : ada);
vcb.genotypes(gb.make());
add(vcb);
}
}
}
private static Genotype attr(final String name, final Allele ref, final String key, final Object ... value) {
if ( value.length == 0 )
return GenotypeBuilder.create(name, Arrays.asList(ref, ref));