Merged bug fix from Stable into Unstable

This commit is contained in:
Eric Banks 2012-03-16 14:33:53 -04:00
commit be9e48ba29
10 changed files with 288 additions and 237 deletions

View File

@ -42,7 +42,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
if (!vc.isBiallelic()) {
// for non-bliallelic case, do test with most common alt allele.
// Get then corresponding indeces in GL vectors to retrieve GL of AA,AB and BB.
int[] idxVector = vc.getGLIndecesOfAllele(vc.getAltAlleleWithHighestAlleleCount());
int[] idxVector = vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount());
idxAA = idxVector[0];
idxAB = idxVector[1];
idxBB = idxVector[2];

View File

@ -64,7 +64,6 @@ public enum DiploidGenotype {
return r != base2;
else
return base2 == r;
//return MathUtils.countOccurrences(r, this.toString()) == 1;
}
public boolean isHom() {

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
@ -61,7 +60,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
//linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result, false);
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result);
return alleles;
}
@ -85,21 +84,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
for ( int i = 0; i < numOriginalAltAlleles; i++ )
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
// make sure that we've cached enough data
if ( numOriginalAltAlleles > UnifiedGenotyperEngine.PLIndexToAlleleIndex.length - 1 )
UnifiedGenotyperEngine.calculatePLcache(numOriginalAltAlleles);
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
for ( final double[] likelihoods : GLs ) {
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numOriginalAltAlleles][PLindexOfBestGL];
if ( alleles[0] != 0 )
likelihoodSums[alleles[0]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
if ( alleles.alleleIndex1 != 0 )
likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
// don't double-count it
if ( alleles[1] != 0 && alleles[1] != alleles[0] )
likelihoodSums[alleles[1]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 )
likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
}
}
@ -189,24 +184,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// the column of the matrix
final double[] log10Likelihoods;
// mapping of column index for those columns upon which this one depends to the index into the PLs which is used as the transition to this column;
// for example, in the biallelic case, the transition from k=0 to k=1 would be AB while the transition to k=2 would be BB.
final HashMap<ExactACcounts, Integer> ACsetIndexToPLIndex = new HashMap<ExactACcounts, Integer>();
// to minimize memory consumption, we know we can delete any sets in this list because no further sets will depend on them
final ArrayList<ExactACcounts> dependentACsetsToDelete = new ArrayList<ExactACcounts>();
int sum = -1;
public ExactACset(final int size, final ExactACcounts ACcounts) {
this.ACcounts = ACcounts;
log10Likelihoods = new double[size];
Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY);
}
// sum of all the non-reference alleles
public int getACsum() {
int sum = 0;
for ( int count : ACcounts.getCounts() )
sum += count;
if ( sum == -1 ) {
sum = 0;
for ( int count : ACcounts.getCounts() )
sum += count;
}
return sum;
}
@ -215,15 +207,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
}
// TODO -- remove me
public static void linearExactMultiAllelic(final GenotypesContext GLs,
final int numAlternateAlleles,
final double[][] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result,
final boolean preserveData) {
final boolean foo) {
linearExactMultiAllelic(GLs, numAlternateAlleles, log10AlleleFrequencyPriors, result);
}
// make sure the PL cache has been initialized
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null )
UnifiedGenotyperEngine.calculatePLcache(5);
public static void linearExactMultiAllelic(final GenotypesContext GLs,
final int numAlternateAlleles,
final double[][] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1;
@ -241,21 +239,20 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet);
// optimization: create the temporary storage for computing L(j,k) just once
final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1;
final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies];
for ( int i = 0; i < maxPossibleDependencies; i++ )
tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY;
// keep processing while we have AC conformations that need to be calculated
double maxLog10L = Double.NEGATIVE_INFINITY;
while ( !ACqueue.isEmpty() ) {
// compute log10Likelihoods
final ExactACset set = ACqueue.remove();
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods);
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
// adjust max likelihood seen if needed
maxLog10L = Math.max(maxLog10L, log10LofKs);
// clean up memory
indexesToACset.remove(set.ACcounts);
//if ( DEBUG )
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
}
}
@ -273,27 +270,16 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final ArrayList<double[]> genotypeLikelihoods,
final double maxLog10L,
final int numChr,
final boolean preserveData,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[][] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result,
final double[][] tempLog10ConformationLikelihoods) {
final AlleleFrequencyCalculationResult result) {
//if ( DEBUG )
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
// compute the log10Likelihoods
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods);
// clean up memory
if ( !preserveData ) {
for ( ExactACcounts index : set.dependentACsetsToDelete ) {
indexesToACset.remove(index);
//if ( DEBUG )
// System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
}
}
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result);
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
@ -301,11 +287,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
//if ( DEBUG )
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
// no reason to keep this data around because nothing depends on it
if ( !preserveData )
indexesToACset.remove(set.ACcounts);
return log10LofK;
}
@ -316,15 +297,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final int numAltAlleles = set.ACcounts.getCounts().length;
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
// so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD.
// add conformations for the k+1 case
int PLindex = 0;
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
ACcountsClone[allele]++;
updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
// to get to this conformation, a sample would need to be AB (remember that ref=0)
final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
}
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
@ -338,71 +317,51 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
ACcountsClone[allele_i]++;
ACcountsClone[allele_j]++;
// to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index)
final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1);
if ( allele_i == allele_j )
sameAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
sameAlleles.add(new DependentSet(ACcountsClone, PLindex));
else
differentAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
differentAlleles.add(new DependentSet(ACcountsClone, PLindex));
}
}
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
for ( DependentSet dependent : differentAlleles )
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
for ( DependentSet dependent : sameAlleles )
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
}
// determine which is the last dependent set in the queue (not necessarily the last one added above) so we can know when it is safe to clean up this column
if ( !preserveData ) {
final ExactACset lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
if ( lastSet != null )
lastSet.dependentACsetsToDelete.add(set.ACcounts);
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
}
return log10LofK;
}
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
// also adds it as a dependency to the given callingSetIndex.
// returns the ExactACset if that set was not already in the queue and null otherwise.
private static void updateACset(final int[] ACcounts,
// also pushes its value to the given callingSetIndex.
private static void updateACset(final int[] newSetCounts,
final int numChr,
final ExactACset callingSet,
final ExactACset dependentSet,
final int PLsetIndex,
final Queue<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
final ExactACcounts index = new ExactACcounts(ACcounts);
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final ArrayList<double[]> genotypeLikelihoods) {
final ExactACcounts index = new ExactACcounts(newSetCounts);
if ( !indexesToACset.containsKey(index) ) {
ExactACset set = new ExactACset(numChr/2 +1, index);
indexesToACset.put(index, set);
ACqueue.add(set);
}
// add the given dependency to the set
// push data from the dependency to the new set
//if ( DEBUG )
// System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts);
final ExactACset set = indexesToACset.get(index);
set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex);
}
private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final LinkedList<ExactACset> ACqueue) {
Iterator<ExactACset> reverseIterator = ACqueue.descendingIterator();
while ( reverseIterator.hasNext() ) {
final ExactACset queued = reverseIterator.next();
if ( queued.ACsetIndexToPLIndex.containsKey(callingSetIndex) )
return queued;
}
// shouldn't get here
throw new ReviewedStingException("Error: no sets in the queue currently hold " + callingSetIndex + " as a dependent!");
// System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts);
pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
}
private static void computeLofK(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[][] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result,
final double[][] tempLog10ConformationLikelihoods) {
final AlleleFrequencyCalculationResult result) {
set.log10Likelihoods[0] = 0.0; // the zero case
final int totalK = set.getACsum();
@ -414,42 +373,18 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
// k > 0 for at least one k
else {
// deal with the non-AA possible conformations
int conformationIndex = 1;
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
//if ( DEBUG )
// System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
ExactACset dependent = indexesToACset.get(mapping.getKey());
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
if ( totalK <= 2*j ) { // skip impossible conformations
final double[] gl = genotypeLikelihoods.get(j);
tempLog10ConformationLikelihoods[j][conformationIndex] =
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()];
} else {
tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY;
}
}
conformationIndex++;
}
// finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
// the non-AA possible conformations were dealt with by pushes from dependent sets;
// now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
if ( totalK < 2*j-1 ) {
final double[] gl = genotypeLikelihoods.get(j);
tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
} else {
tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY;
final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
}
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths);
set.log10Likelihoods[j] = log10Max - logDenominator;
set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
}
}
@ -478,6 +413,23 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
}
private static void pushData(final ExactACset targetSet,
final ExactACset dependentSet,
final int PLsetIndex,
final ArrayList<double[]> genotypeLikelihoods) {
final int totalK = targetSet.getACsum();
for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) {
if ( totalK <= 2*j ) { // skip impossible conformations
final double[] gl = genotypeLikelihoods.get(j);
final double conformationValue =
determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex];
targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue);
}
}
}
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
// the closed form representation generalized for multiple alleles is as follows:
@ -488,25 +440,26 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// BC: 2 * k_b * k_c
// CC: k_c * (k_c - 1)
final int numAltAlleles = ACcounts.length;
// find the 2 alleles that are represented by this PL index
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
// *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
// *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. ***
// the AX het case
if ( PLindex <= numAltAlleles )
return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK];
if ( alleles.alleleIndex1 == 0 )
return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK];
// find the 2 alternate alleles that are represented by this PL index
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numAltAlleles][PLindex];
final int k_i = ACcounts[alleles[0]-1]; // subtract one because ACcounts doesn't consider the reference allele
final int k_i = ACcounts[alleles.alleleIndex1-1];
// the hom var case (e.g. BB, CC, DD)
final double coeff;
if ( alleles[0] == alleles[1] ) {
if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) {
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
}
// the het non-ref case (e.g. BC, BD, CD)
else {
final int k_j = ACcounts[alleles[1]-1];
final int k_j = ACcounts[alleles.alleleIndex2-1];
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
}

View File

@ -56,10 +56,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
// make sure the PL cache has been initialized with enough alleles
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null || UnifiedGenotyperEngine.PLIndexToAlleleIndex.length < 4 ) // +1 for 0 alt alleles
UnifiedGenotyperEngine.calculatePLcache(3);
}
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
@ -136,6 +132,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
}
builder.alleles(alleles);
// create the PL ordering to use based on the allele ordering.
final int[] PLordering = new int[numLikelihoods];
for ( int i = 0; i <= numAltAlleles; i++ ) {
for ( int j = i; j <= numAltAlleles; j++ ) {
// As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.
// In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc."
PLordering[(j * (j+1) / 2) + i] = DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal();
}
}
// create the genotypes; no-call everyone for now
final GenotypesContext genotypes = GenotypesContext.create();
final List<Allele> noCall = new ArrayList<Allele>();
@ -145,12 +151,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final double[] allLikelihoods = sampleData.GL.getLikelihoods();
final double[] myLikelihoods = new double[numLikelihoods];
int myLikelihoodsIndex = 0;
for ( int i = 0; i <= numAltAlleles; i++ ) {
for ( int j = i; j <= numAltAlleles; j++ ) {
myLikelihoods[myLikelihoodsIndex++] = allLikelihoods[DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal()];
}
}
for ( int i = 0; i < numLikelihoods; i++ )
myLikelihoods[i] = allLikelihoods[PLordering[i]];
// normalize in log space so that max element is zero.
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
@ -177,12 +179,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final double[] likelihoods = sampleData.GL.getLikelihoods();
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
if ( PLindexOfBestGL != PLindexOfRef ) {
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL];
if ( alleles[0] != baseIndexOfRef )
likelihoodSums[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePairUsingDeprecatedOrdering(PLindexOfBestGL);
if ( alleles.alleleIndex1 != baseIndexOfRef )
likelihoodSums[alleles.alleleIndex1] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
// don't double-count it
if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] )
likelihoodSums[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
if ( alleles.alleleIndex2 != baseIndexOfRef && alleles.alleleIndex2 != alleles.alleleIndex1 )
likelihoodSums[alleles.alleleIndex2] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
}
}

View File

@ -104,10 +104,6 @@ public class UnifiedGenotyperEngine {
private final GenomeLocParser genomeLocParser;
private final boolean BAQEnabledOnCMDLine;
// a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
// the representation is int[number of alternate alleles][PL index][pair of allele indexes (where reference = 0)]
protected static int[][][] PLIndexToAlleleIndex;
// ---------------------------------------------------------------------------------------------------------
//
@ -140,27 +136,6 @@ public class UnifiedGenotyperEngine {
genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL);
filter.add(LOW_QUAL_FILTER_NAME);
calculatePLcache(UAC.MAX_ALTERNATE_ALLELES);
}
protected static void calculatePLcache(int maxAltAlleles) {
PLIndexToAlleleIndex = new int[maxAltAlleles+1][][];
PLIndexToAlleleIndex[0] = new int[][]{ new int[]{0, 0} };
int numLikelihoods = 1;
// for each count of alternate alleles
for ( int altAlleles = 1; altAlleles <= maxAltAlleles; altAlleles++ ) {
numLikelihoods += altAlleles + 1;
PLIndexToAlleleIndex[altAlleles] = new int[numLikelihoods][];
int PLindex = 0;
// for all possible combinations of the 2 alt alleles
for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) {
for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) {
PLIndexToAlleleIndex[altAlleles][PLindex++] = new int[]{ allele1, allele2 };
}
}
}
}
/**
@ -794,21 +769,17 @@ public class UnifiedGenotyperEngine {
if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) {
likelihoodIndexesToUse = new ArrayList<Integer>(30);
// make sure that we've cached enough data
if ( numOriginalAltAlleles > PLIndexToAlleleIndex.length - 1 )
calculatePLcache(numOriginalAltAlleles);
final int[][] PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles];
final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles];
for ( int i = 0; i < numOriginalAltAlleles; i++ ) {
if ( allelesToUse.contains(vc.getAlternateAllele(i)) )
altAlleleIndexToUse[i] = true;
}
for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) {
final int[] alleles = PLcache[PLindex];
final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles);
for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) {
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
// consider this entry only if both of the alleles are good
if ( (alleles[0] == 0 || altAlleleIndexToUse[alleles[0] - 1]) && (alleles[1] == 0 || altAlleleIndexToUse[alleles[1] - 1]) )
if ( (alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1]) )
likelihoodIndexesToUse.add(PLindex);
}
}
@ -861,11 +832,11 @@ public class UnifiedGenotyperEngine {
protected static Genotype assignGenotype(final Genotype originalGT, final double[] newLikelihoods, final List<Allele> allelesToUse, final int numNewAltAlleles, final Map<String, Object> attrs) {
// find the genotype with maximum likelihoods
int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex];
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
myAlleles.add(allelesToUse.get(alleles[0]));
myAlleles.add(allelesToUse.get(alleles[1]));
myAlleles.add(allelesToUse.get(alleles.alleleIndex1));
myAlleles.add(allelesToUse.get(alleles.alleleIndex2));
final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods);
return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false);

View File

@ -25,13 +25,10 @@
package org.broadinstitute.sting.utils.variantcontext;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.jgrapht.util.MathUtil;
import java.util.EnumMap;
import java.util.Map;
public class GenotypeLikelihoods {
public static final boolean CAP_PLS = false;
@ -201,4 +198,116 @@ public class GenotypeLikelihoods {
return s.toString();
}
// -------------------------------------------------------------------------------------
//
// Static conversion utilities, going from GL/PL index to allele index and vice versa.
//
// -------------------------------------------------------------------------------------
/*
* Class representing the 2 alleles (or rather their indexes into VariantContext.getAllele()) corresponding to a specific PL index.
* Note that the reference allele is always index=0.
*/
public static class GenotypeLikelihoodsAllelePair {
public final int alleleIndex1, alleleIndex2;
public GenotypeLikelihoodsAllelePair(final int alleleIndex1, final int alleleIndex2) {
this.alleleIndex1 = alleleIndex1;
this.alleleIndex2 = alleleIndex2;
}
}
/*
* a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
*/
private static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = new GenotypeLikelihoodsAllelePair[]{ new GenotypeLikelihoodsAllelePair(0, 0) };
private static void calculatePLcache(final int minIndex) {
// how many alternate alleles do we need to calculate for?
int altAlleles = 0;
int numLikelihoods = 1;
while ( numLikelihoods <= minIndex ) {
altAlleles++;
numLikelihoods += altAlleles + 1;
}
PLIndexToAlleleIndex = new GenotypeLikelihoodsAllelePair[numLikelihoods];
// for all possible combinations of 2 alleles
for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) {
for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) {
PLIndexToAlleleIndex[calculatePLindex(allele1, allele2)] = new GenotypeLikelihoodsAllelePair(allele1, allele2);
}
}
}
// how many likelihoods are associated with the given number of alternate alleles?
public static int calculateNumLikelihoods(int numAltAlleles) {
int numLikelihoods = 1;
for ( int i = 1; i <= numAltAlleles; i++ )
numLikelihoods += i + 1;
return numLikelihoods;
}
// As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.
// In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc."
// Assumes that allele1Index < allele2Index
public static int calculatePLindex(final int allele1Index, final int allele2Index) {
return (allele2Index * (allele2Index+1) / 2) + allele1Index;
}
/**
* get the allele index pair for the given PL
*
* @param PLindex the PL index
* @return the allele index pair
*/
public static GenotypeLikelihoodsAllelePair getAllelePair(final int PLindex) {
// make sure that we've cached enough data
if ( PLindex >= PLIndexToAlleleIndex.length )
calculatePLcache(PLindex);
return PLIndexToAlleleIndex[PLindex];
}
// An index conversion from the deprecated PL ordering to the new VCF-based ordering for up to 3 alternate alleles
protected static int[] PLindexConversion = new int[]{0, 1, 3, 6, 2, 4, 7, 5, 8, 9};
/**
* get the allele index pair for the given PL using the deprecated PL ordering:
* AA,AB,AC,AD,BB,BC,BD,CC,CD,DD instead of AA,AB,BB,AC,BC,CC,AD,BD,CD,DD.
* Although it's painful to keep this conversion around, our DiploidSNPGenotypeLikelihoods class uses the deprecated
* ordering and I know with certainty that external users have built code on top of it; changing it now would
* cause a whole lot of heartache for our collaborators, so for now at least there's a standard conversion method.
* This method assumes at most 3 alternate alleles.
* TODO -- address this issue at the source by updating DiploidSNPGenotypeLikelihoods.
*
* @param PLindex the PL index
* @return the allele index pair
*/
public static GenotypeLikelihoodsAllelePair getAllelePairUsingDeprecatedOrdering(final int PLindex) {
// make sure that we've cached enough data
if ( PLindex >= PLIndexToAlleleIndex.length )
calculatePLcache(PLindex);
return PLIndexToAlleleIndex[PLindexConversion[PLindex]];
}
/**
* get the PL indexes (AA, AB, BB) for the given allele pair; assumes allele1Index <= allele2Index.
*
* @param allele1Index the index in VariantContext.getAllele() of the first allele
* @param allele2Index the index in VariantContext.getAllele() of the second allele
* @return the PL indexes
*/
public static int[] getPLIndecesOfAlleles(final int allele1Index, final int allele2Index) {
final int[] indexes = new int[3];
indexes[0] = calculatePLindex(allele1Index, allele1Index);
indexes[1] = calculatePLindex(allele1Index, allele2Index);
indexes[2] = calculatePLindex(allele2Index, allele2Index);
return indexes;
}
}

View File

@ -1255,40 +1255,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati
return best;
}
public int[] getGLIndecesOfAllele(Allele inputAllele) {
public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) {
// TODO -- this information is cached statically by the UnifiedGenotyperEngine; pull it out into a common utils class for all to use
int[] idxVector = new int[3];
int numAlleles = this.getAlleles().size();
int idxDiag = numAlleles;
int incr = numAlleles - 1;
int k=1;
for (Allele a: getAlternateAlleles()) {
// multi-allelic approximation, part 1: Ideally
// for each alt allele compute marginal (suboptimal) posteriors -
// compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of
// as a row-wise upper triangular matrix of likelihoods.
// So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC.
// 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD
int idxAA = 0;
int idxAB = k++;
// yy is always element on the diagonal.
// 2 alleles: BBelement 2
// 3 alleles: BB element 3. CC element 5
// 4 alleles:
int idxBB = idxDiag;
if (a.equals(inputAllele)) {
idxVector[0] = idxAA;
idxVector[1] = idxAB;
idxVector[2] = idxBB;
int index = 1;
for ( Allele allele : getAlternateAlleles() ) {
if ( allele.equals(targetAllele) )
break;
}
idxDiag += incr--;
index++;
}
return idxVector;
return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index);
}
}

View File

@ -27,8 +27,8 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
BB1 = new double[]{-20.0, -20.0, 0.0};
AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
AC2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
BB2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
}

View File

@ -5,8 +5,10 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
// ********************************************************************************** //
@ -60,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
Arrays.asList("5af005255240a2186f04cb50851b8b6f"));
Arrays.asList("0de4aeed6a52f08ed86a7642c812478b"));
executeTest("test Multiple SNP alleles", spec);
}
@ -277,42 +279,53 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testWithIndelAllelesPassedIn1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("9cd08dc412a007933381e9c76c073899"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1);
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@Test
public void testWithIndelAllelesPassedIn2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("5ef1f007d3ef77c1b8f31e5e036eff53"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2);
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@Test
public void testWithIndelAllelesPassedIn3() {
public void testMultiSampleIndels() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("52340d578a708fa709b69ce48987bc9d"));
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
Arrays.asList("2609675a356f2dfc86f8a1d911210978"));
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("9566c7abef5ee5829a516d90445b347f"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@Test
public void testWithIndelAllelesPassedIn4() {
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
Arrays.asList("4fdd8da77167881b71b3547da5c13f94"));
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
public void testGGAwithNoEvidenceInReads() {
final String vcf = "small.indel.test.vcf";
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + vcf + " -I " + validationDataLocation +
"NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1,
Arrays.asList("7d069596597aee5e0d562964036141eb"));
executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing SnpEff
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void testWithIndelAllelesPassedIn5() {
final String vcf = "small.indel.test.vcf";

View File

@ -117,7 +117,7 @@ public class GenotypeLikelihoodsUnitTest {
}
@Test
public void testgetQualFromLikelihoods(){
public void testgetQualFromLikelihoods() {
double[] likelihoods = new double[]{-1, 0, -2};
// qual values we expect for each possible "best" genotype
double[] expectedQuals = new double[]{-0.04100161, -1, -0.003930294};
@ -134,4 +134,33 @@ public class GenotypeLikelihoodsUnitTest {
Assert.assertEquals(v1[i], v2[i], 1e-6);
}
}
@Test
public void testCalculatePLindex(){
int counter = 0;
for ( int i = 0; i <= 3; i++ ) {
for ( int j = i; j <= 3; j++ ) {
Assert.assertEquals(GenotypeLikelihoods.calculatePLindex(i, j), GenotypeLikelihoods.PLindexConversion[counter++], "PL index of alleles " + i + "," + j + " was not calculated correctly");
}
}
}
@Test
public void testGetAllelePair(){
allelePairTest(0, 0, 0);
allelePairTest(1, 0, 1);
allelePairTest(2, 1, 1);
allelePairTest(3, 0, 2);
allelePairTest(4, 1, 2);
allelePairTest(5, 2, 2);
allelePairTest(6, 0, 3);
allelePairTest(7, 1, 3);
allelePairTest(8, 2, 3);
allelePairTest(9, 3, 3);
}
private void allelePairTest(int PLindex, int allele1, int allele2) {
Assert.assertEquals(GenotypeLikelihoods.getAllelePair(PLindex).alleleIndex1, allele1, "allele index " + allele1 + " from PL index " + PLindex + " was not calculated correctly");
Assert.assertEquals(GenotypeLikelihoods.getAllelePair(PLindex).alleleIndex2, allele2, "allele index " + allele2 + " from PL index " + PLindex + " was not calculated correctly");
}
}