diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index 97f3ecd0c..6f9a8f5e6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -28,6 +28,8 @@ import org.apache.log4j.Logger; import org.broad.tribble.TribbleException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; import java.util.LinkedHashMap; @@ -67,14 +69,25 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF return count; } - // utility method - public int getCount(int numAltAlleles) { + /** + * Get the number of values expected for this header field, given the properties of VariantContext vc + * + * If the count is a fixed count, return that. For example, a field with size of 1 in the header returns 1 + * If the count is of type A, return vc.getNAlleles - 1 + * If the count is of type G, return the expected number of genotypes given the number of alleles in VC and the + * max ploidy among all samples + * If the count is UNBOUNDED return -1 + * + * @param vc + * @return + */ + public int getCount(final VariantContext vc) { int myCount; switch ( countType ) { case INTEGER: myCount = count; break; case UNBOUNDED: myCount = -1; break; - case A: myCount = numAltAlleles; break; - case G: myCount = ((numAltAlleles + 1) * (numAltAlleles + 2) / 2); break; + case A: myCount = vc.getNAlleles() - 1; break; + case G: myCount = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), vc.getMaxPloidy()); break; default: throw new ReviewedStingException("Unknown count type: " + countType); } return myCount; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index f45b0e615..d268aabc6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -554,7 +554,7 @@ public abstract class Genotype implements Comparable { pairs.add(k + "=" + c.get(k)); } - return "{" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}"; + return pairs.isEmpty() ? "" : " {" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}"; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index fa41a3c99..7c745628a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -24,6 +24,8 @@ package org.broadinstitute.sting.utils.variantcontext; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import org.broad.tribble.TribbleException; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -34,6 +36,11 @@ import java.util.Arrays; import java.util.EnumMap; public class GenotypeLikelihoods { + private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5; + private final static int NUM_LIKELIHOODS_CACHE_PLOIDY = 10; + // caching numAlleles up to 5 and ploidy up to 10 + private final static int[][] numLikelihoodCache = new int[NUM_LIKELIHOODS_CACHE_N_ALLELES][NUM_LIKELIHOODS_CACHE_PLOIDY]; + public final static int MAX_PL = Short.MAX_VALUE; // @@ -44,6 +51,30 @@ public class GenotypeLikelihoods { private double[] log10Likelihoods = null; private String likelihoodsAsString_PLs = null; + + /** + * initialize num likelihoods cache + */ + static { + // must be done before PLIndexToAlleleIndex + for ( int numAlleles = 1; numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES; numAlleles++ ) { + //numLikelihoodCache[numAlleles] = new int[NUM_LIKELIHOODS_CACHE_PLOIDY]; + for ( int ploidy = 1; ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY; ploidy++ ) { + numLikelihoodCache[numAlleles][ploidy] = calcNumLikelihoods(numAlleles, ploidy); + } + } + } + + /** + * The maximum number of alleles that we can represent as genotype likelihoods + */ + public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50; + + /* + * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles + */ + private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); + public final static GenotypeLikelihoods fromPLField(String PLs) { return new GenotypeLikelihoods(PLs); } @@ -245,47 +276,11 @@ public class GenotypeLikelihoods { return likelihoodsAsVector; } -// // ------------------------------------------------------------------------------------- -// // -// // List interface functions -// // -// // ------------------------------------------------------------------------------------- -// -// private final void notImplemented() { -// throw new ReviewedStingException("BUG: code not implemented"); -// } -// -// @Override public int size() { return getAsVector().length; } -// @Override public Double get(final int i) { return getAsVector()[i];} -// @Override public Double set(final int i, final Double aDouble) { return getAsVector()[i] = aDouble; } -// @Override public boolean isEmpty() { return false; } -// @Override public Iterator iterator() { return Arrays.asList(ArrayUtils.toObject(getAsVector())).iterator(); } -// @Override public Object[] toArray() { return ArrayUtils.toObject(getAsVector()); } -// -// // none of these are implemented -// @Override public boolean contains(final Object o) { notImplemented(); return false; } -// @Override public T[] toArray(final T[] ts) { notImplemented(); return null; } -// @Override public boolean add(final Double aDouble) { notImplemented(); return false; } -// @Override public boolean remove(final Object o) {notImplemented(); return false; } -// @Override public boolean containsAll(final Collection objects) { notImplemented(); return false; } -// @Override public boolean addAll(final Collection doubles) { notImplemented(); return false; } -// @Override public boolean addAll(final int i, final Collection doubles) { notImplemented(); return false; } -// @Override public boolean removeAll(final Collection objects) { notImplemented(); return false; } -// @Override public boolean retainAll(final Collection objects) { notImplemented(); return false; } -// @Override public void clear() { notImplemented(); } -// @Override public void add(final int i, final Double aDouble) { notImplemented(); } -// @Override public Double remove(final int i) { notImplemented(); return null; } -// @Override public int indexOf(final Object o) { notImplemented(); return -1; } -// @Override public int lastIndexOf(final Object o) { notImplemented(); return 0; } -// @Override public ListIterator listIterator() { notImplemented(); return null; } -// @Override public ListIterator listIterator(final int i) { notImplemented(); return null; } -// @Override public List subList(final int i, final int i1) { notImplemented(); return null; } - -// ------------------------------------------------------------------------------------- -// -// Static conversion utilities, going from GL/PL index to allele index and vice versa. -// -// ------------------------------------------------------------------------------------- + // ------------------------------------------------------------------------------------- + // + // Static conversion utilities, going from GL/PL index to allele index and vice versa. + // + // ------------------------------------------------------------------------------------- /* * Class representing the 2 alleles (or rather their indexes into VariantContext.getAllele()) corresponding to a specific PL index. @@ -300,18 +295,8 @@ public class GenotypeLikelihoods { } } - /** - * The maximum number of alleles that we can represent as genotype likelihoods - */ - public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50; - - /* - * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles - */ - private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); - private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) { - final int numLikelihoods = calculateNumLikelihoods(1+altAlleles, 2); + final int numLikelihoods = numLikelihoods(1 + altAlleles, 2); final GenotypeLikelihoodsAllelePair[] cache = new GenotypeLikelihoodsAllelePair[numLikelihoods]; // for all possible combinations of 2 alleles @@ -330,6 +315,32 @@ public class GenotypeLikelihoods { return cache; } + // ------------------------------------------------------------------------------------- + // + // num likelihoods given number of alleles and ploidy + // + // ------------------------------------------------------------------------------------- + + /** + * Actually does the computation in @see #numLikelihoods + * + * @param numAlleles + * @param ploidy + * @return + */ + private static final int calcNumLikelihoods(final int numAlleles, final int ploidy) { + if (numAlleles == 1) + return 1; + else if (ploidy == 1) + return numAlleles; + else { + int acc =0; + for (int k=0; k <= ploidy; k++ ) + acc += calcNumLikelihoods(numAlleles - 1, ploidy - k); + return acc; + } + } + /** * Compute how many likelihood elements are associated with the given number of alleles * Equivalent to asking in how many ways N non-negative integers can add up to P is S(N,P) @@ -344,6 +355,8 @@ public class GenotypeLikelihoods { * which is then, for ordering above, (2,0,0), (1,1,0), (0,2,0), (1,1,0), (0,1,1), (0,0,2) * In general, for P=2 (regular biallelic), then S(N,2) = N*(N+1)/2 * + * Note this method caches the value for most common num Allele / ploidy combinations for efficiency + * * Recursive implementation: * S(N,P) = sum_{k=0}^P S(N-1,P-k) * because if we have N integers, we can condition 1 integer to be = k, and then N-1 integers have to sum to P-K @@ -355,23 +368,16 @@ public class GenotypeLikelihoods { * @param ploidy Ploidy, or number of chromosomes in set * @return Number of likelihood elements we need to hold. */ - public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) { - - // fast, closed form solution for diploid samples (most common use case) - if (ploidy==2) - return numAlleles*(numAlleles+1)/2; - - if (numAlleles == 1) - return 1; - else if (ploidy == 1) - return numAlleles; - - int acc =0; - for (int k=0; k <= ploidy; k++ ) - acc += calculateNumLikelihoods(numAlleles-1, ploidy-k); - - return acc; - + @Requires({"ploidy > 0", "numAlleles > 0"}) + @Ensures("result > 0") + public static int numLikelihoods(final int numAlleles, final int ploidy) { + if ( numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES + && ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY ) + return numLikelihoodCache[numAlleles][ploidy]; + else { + // have to calculate on the fly + return calcNumLikelihoods(numAlleles, ploidy); + } } // As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j. diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java index fc4175735..9577a3e63 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -61,6 +61,11 @@ public class GenotypesContext implements List { */ ArrayList notToBeDirectlyAccessedGenotypes; + /** + * Cached value of the maximum ploidy observed among all samples + */ + private int maxPloidy = -1; + /** Are we allowing users to modify the list? */ boolean immutable = false; @@ -408,6 +413,16 @@ public class GenotypesContext implements List { return getGenotypes().get(i); } + @Ensures("result >= 0") + public int getMaxPloidy() { + if ( maxPloidy == -1 ) { + for ( final Genotype g : getGenotypes() ) { + maxPloidy = Math.max(g.getPloidy(), maxPloidy); + } + } + return maxPloidy; + } + /** * Gets sample associated with this sampleName, or null if none is found * diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index b4adc0a8a..dc600d97c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -626,14 +626,13 @@ public class VariantContext implements Feature { // to enable tribble integratio /** * Returns the maximum ploidy of all samples in this VC, or -1 if there are no genotypes + * + * This function is caching, so it's only expensive on the first call + * * @return -1, or the max ploidy */ public int getMaxPloidy() { - int max = -1; - for ( final Genotype g : getGenotypes() ) { - max = Math.max(g.getPloidy(), max); - } - return max; + return genotypes.getMaxPloidy(); } /** @@ -1381,7 +1380,7 @@ public class VariantContext implements Feature { // to enable tribble integratio && format.getCountType() != VCFHeaderLineCount.UNBOUNDED && format.getType() != VCFHeaderLineType.Flag ) { // we expect exactly the right number of elements final int obsSize = decoded instanceof List ? ((List) decoded).size() : 1; - final int expSize = format.getCount(this.getNAlleles() - 1); + final int expSize = format.getCount(this); if ( obsSize != expSize ) { throw new UserException.MalformedVCFHeader("Discordant field size detected for field " + field + " at " + getChr() + ":" + getStart() + ". Field had " + obsSize + " values " + diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java index 83ddd2a1f..01d3ab456 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java @@ -159,16 +159,20 @@ public class VariantContextBuilder { return this; } - public VariantContextBuilder alleles(final String ... alleleStrings) { - List alleles = new ArrayList(alleleStrings.length); + public VariantContextBuilder alleles(final List alleleStrings) { + List alleles = new ArrayList(alleleStrings.size()); - for ( int i = 0; i < alleleStrings.length; i++ ) { - alleles.add(Allele.create(alleleStrings[i], i == 0)); + for ( int i = 0; i < alleleStrings.size(); i++ ) { + alleles.add(Allele.create(alleleStrings.get(i), i == 0)); } return alleles(alleles); } + public VariantContextBuilder alleles(final String ... alleleStrings) { + return alleles(Arrays.asList(alleleStrings)); + } + public List getAlleles() { return new ArrayList(alleles); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 223b4509b..b697b3381 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -1199,8 +1199,8 @@ public class VariantContextUtils { altAlleleIndexToUse[i] = true; } - // calculateNumLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2 - final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(1+numOriginalAltAlleles, DEFAULT_PLOIDY); + // numLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2 + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(1 + numOriginalAltAlleles, DEFAULT_PLOIDY); for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); // consider this entry only if both of the alleles are good diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java index ecc1cd3e0..812e6dd07 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java @@ -185,7 +185,7 @@ public abstract class BCF2FieldEncoder { @Requires("hasContextDeterminedNumElements()") @Ensures("result >= 0") public int numElements(final VariantContext vc) { - return headerLine.getCount(vc.getNAlleles() - 1); + return headerLine.getCount(vc); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index 03a62019c..651223ac3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -402,7 +402,7 @@ class VCFWriter extends IndexingVariantContextWriter { VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field); if ( metaData != null ) { - int numInFormatField = metaData.getCount(vc.getAlternateAlleles().size()); + int numInFormatField = metaData.getCount(vc); if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. // For example, if Number=2, the string has to be ".,." diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index abaf23132..69f42e1f9 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -100,10 +100,10 @@ public class GenotypeLikelihoodsUnitTest { for (int nAlleles=2; nAlleles<=5; nAlleles++) // simplest case: diploid - Assert.assertEquals(GenotypeLikelihoods.calculateNumLikelihoods(nAlleles, 2), nAlleles*(nAlleles+1)/2); + Assert.assertEquals(GenotypeLikelihoods.numLikelihoods(nAlleles, 2), nAlleles*(nAlleles+1)/2); // some special cases: ploidy = 20, #alleles = 4 - Assert.assertEquals(GenotypeLikelihoods.calculateNumLikelihoods(4, 20), 1771); + Assert.assertEquals(GenotypeLikelihoods.numLikelihoods(4, 20), 1771); } @Test diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 14d60b6e9..e5b45f70f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -51,6 +51,8 @@ import java.util.*; public class VariantContextTestProvider { final protected static Logger logger = Logger.getLogger(VariantContextTestProvider.class); + final private static boolean ENABLE_GENOTYPE_TESTS = true; + final private static boolean ENABLE_A_AND_G_TESTS = true; final private static boolean ENABLE_VARARRAY_TESTS = true; final private static boolean ENABLE_PLOIDY_TESTS = true; final private static boolean ENABLE_PL_TESTS = true; @@ -180,6 +182,7 @@ public class VariantContextTestProvider { addHeaderLine(metaData, "GT", 1, VCFHeaderLineType.String); addHeaderLine(metaData, "GQ", 1, VCFHeaderLineType.Integer); + addHeaderLine(metaData, "ADA", VCFHeaderLineCount.A, VCFHeaderLineType.Integer); addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer); addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String); addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); @@ -268,9 +271,13 @@ public class VariantContextTestProvider { add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2", "s3"))); add(builder().attribute("VAR.INFO.STRING", null)); - addGenotypesToTestData(); + if ( ENABLE_GENOTYPE_TESTS ) { + addGenotypesToTestData(); + addComplexGenotypesTest(); + } - addComplexGenotypesTest(); + if ( ENABLE_A_AND_G_TESTS ) + addGenotypesAndGTests(); } private static void addGenotypesToTestData() { @@ -315,7 +322,6 @@ public class VariantContextTestProvider { } } - private static void addGenotypes( final VariantContext site) { // test ref/ref final Allele ref = site.getReference(); @@ -516,6 +522,46 @@ public class VariantContextTestProvider { } } + private static void addGenotypesAndGTests() { +// for ( final int ploidy : Arrays.asList(2)) { + for ( final int ploidy : Arrays.asList(1, 2, 3, 4, 5)) { + final List> alleleCombinations = + Arrays.asList( + Arrays.asList("A"), + Arrays.asList("A", "C"), + Arrays.asList("A", "C", "G"), + Arrays.asList("A", "C", "G", "T")); + + for ( final List alleles : alleleCombinations ) { + final VariantContextBuilder vcb = builder().alleles(alleles); + final VariantContext site = vcb.make(); + final int nAlleles = site.getNAlleles(); + final Allele ref = site.getReference(); + + // base genotype is ref/.../ref up to ploidy + final List baseGenotype = new ArrayList(ploidy); + for ( int i = 0; i < ploidy; i++) baseGenotype.add(ref); + final int nPLs = GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy); + + // ada is 0, 1, ..., nAlleles - 1 + final List ada = new ArrayList(nAlleles); + for ( int i = 0; i < nAlleles - 1; i++ ) ada.add(i); + + // pl is 0, 1, ..., up to nPLs (complex calc of nAlleles and ploidy) + final int[] pl = new int[nPLs]; + for ( int i = 0; i < pl.length; i++ ) pl[i] = i; + + final GenotypeBuilder gb = new GenotypeBuilder("ADA_PL_SAMPLE"); + gb.alleles(baseGenotype); + gb.PL(pl); + gb.attribute("ADA", nAlleles == 2 ? ada.get(0) : ada); + vcb.genotypes(gb.make()); + + add(vcb); + } + } + } + private static Genotype attr(final String name, final Allele ref, final String key, final Object ... value) { if ( value.length == 0 ) return GenotypeBuilder.create(name, Arrays.asList(ref, ref));