Initial commit of new likelihood model to evaluate indel quality. Principle is simple, a plain Pair HMM with affine gap penalties (in log space) that does quasi-local alignment between reads and candidate haplotypes and which in theory should be more solid and more reliable than the older Dindel-based model. It also allows to be easily extensible in the future if we decide to introduce either context-dependent and/or read-dependent gap penalties.
Model is disabled by default and we're still using the old Dindel model until I'm more confident that new model is a definitive improvement, so right now this is enabled by hidden command line arguments, and it's not to be used yet. In detail: a) Several refactorings to share softMax() available to other modules, so its now part of MathUtils. b) Refactored a couple of read utilities and moved from BAQ to ReadUtils. c) New PairHMMIndelErrorModel class implementing new likelihood model d) Several new hidden debug arguments in UAC. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5389 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
96fe540d66
commit
8c262eb605
|
|
@ -30,7 +30,10 @@ import org.apache.log4j.Logger;
|
|||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
|
|
@ -46,25 +49,39 @@ import java.util.*;
|
|||
|
||||
public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
|
||||
private final int maxReadDeletionLength = 3;
|
||||
private final int HAPLOTYPE_SIZE = 80;
|
||||
private final int HAPLOTYPE_SIZE;
|
||||
|
||||
private final int minIndelCountForGenotyping;
|
||||
private final boolean getAlleleListFromVCF;
|
||||
// todo - the following need to be exposed for command line argument control
|
||||
private final double indelHeterozygosity = 1.0/8000;
|
||||
boolean useFlatPriors = true;
|
||||
private static final boolean DEBUGOUT = false;
|
||||
private static final boolean DEBUG = false;
|
||||
|
||||
private boolean DEBUG = false;
|
||||
|
||||
// todo -cleanup
|
||||
private HaplotypeIndelErrorModel model;
|
||||
private PairHMMIndelErrorModel pairModel;
|
||||
private boolean newLike = false;
|
||||
|
||||
private GenomeLoc lastSiteVisited;
|
||||
private List<Allele> alleleList;
|
||||
|
||||
protected DindelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
model = new HaplotypeIndelErrorModel(maxReadDeletionLength, UAC.INSERTION_START_PROBABILITY,
|
||||
UAC.INSERTION_END_PROBABILITY, UAC.ALPHA_DELETION_PROBABILITY, HAPLOTYPE_SIZE, false, DEBUGOUT);
|
||||
if (UAC.newlike) {
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.S1, UAC.S2, UAC.dovit);
|
||||
newLike = true;
|
||||
}
|
||||
else
|
||||
model = new HaplotypeIndelErrorModel(maxReadDeletionLength, UAC.INSERTION_START_PROBABILITY,
|
||||
UAC.INSERTION_END_PROBABILITY,UAC.ALPHA_DELETION_PROBABILITY,UAC.INDEL_HAPLOTYPE_SIZE, false, UAC.OUTPUT_DEBUG_INDEL_INFO);
|
||||
alleleList = new ArrayList<Allele>();
|
||||
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
||||
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
|
||||
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
|
||||
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -300,11 +317,15 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
if (eventLength<0)
|
||||
eventLength = - eventLength;
|
||||
|
||||
int currentHaplotypeSize = HAPLOTYPE_SIZE;
|
||||
|
||||
// int numSamples = getNSamples(contexts);
|
||||
List<Haplotype> haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
|
||||
ref, currentHaplotypeSize);
|
||||
List<Haplotype> haplotypesInVC;
|
||||
if (newLike)
|
||||
haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
|
||||
ref, HAPLOTYPE_SIZE, HAPLOTYPE_SIZE/2);
|
||||
else
|
||||
haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
|
||||
ref, HAPLOTYPE_SIZE, 20);
|
||||
|
||||
// For each sample, get genotype likelihoods based on pileup
|
||||
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
|
||||
// initialize the GenotypeLikelihoods
|
||||
|
|
@ -316,7 +337,7 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
priors = new DiploidIndelGenotypePriors();
|
||||
}
|
||||
else
|
||||
priors = new DiploidIndelGenotypePriors(indelHeterozygosity,eventLength,currentHaplotypeSize);
|
||||
priors = new DiploidIndelGenotypePriors(indelHeterozygosity,eventLength,HAPLOTYPE_SIZE);
|
||||
|
||||
//double[] priorLikelihoods = priors.getPriors();
|
||||
|
||||
|
|
@ -330,7 +351,11 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
pileup = context.getBasePileup();
|
||||
|
||||
if (pileup != null ) {
|
||||
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC);
|
||||
|
||||
if (newLike)
|
||||
haplotypeLikehoodMatrix = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, ref);
|
||||
else
|
||||
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC);
|
||||
|
||||
|
||||
double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix);
|
||||
|
|
@ -342,7 +367,8 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
genotypeLikelihoods[1],
|
||||
genotypeLikelihoods[2],
|
||||
getFilteredDepth(pileup)));
|
||||
//System.out.format("%4.2f %4.2f %4.2f\n",genotypeLikelihoods[0],genotypeLikelihoods[1], genotypeLikelihoods[2]);
|
||||
if (DEBUG)
|
||||
System.out.format("Sample:%s GL:%4.2f %4.2f %4.2f\n",sample.getKey(), genotypeLikelihoods[0],genotypeLikelihoods[1], genotypeLikelihoods[2]);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -57,29 +57,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
|
||||
private boolean SIMPLE_GREEDY_GENOTYPER = false;
|
||||
private static final double[] log10Cache;
|
||||
private static final double[] jacobianLogTable;
|
||||
// private static final int JACOBIAN_LOG_TABLE_SIZE = 100001;
|
||||
// private static final double JACOBIAN_LOG_TABLE_STEP = 0.0001;
|
||||
private static final int JACOBIAN_LOG_TABLE_SIZE = 101;
|
||||
private static final double JACOBIAN_LOG_TABLE_STEP = 0.1;
|
||||
private static final double MAX_JACOBIAN_TOLERANCE = 10.0;
|
||||
private static final int MAXN = 10000; // todo -- warning, this might be hit at some point...
|
||||
|
||||
static {
|
||||
log10Cache = new double[2*MAXN];
|
||||
jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE];
|
||||
|
||||
log10Cache[0] = Double.NEGATIVE_INFINITY;
|
||||
for (int k=1; k < 2*MAXN; k++)
|
||||
log10Cache[k] = Math.log10(k);
|
||||
|
||||
for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
|
||||
jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k) * JACOBIAN_LOG_TABLE_STEP));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
final private ExactCalculation calcToUse;
|
||||
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
|
|
@ -314,19 +293,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
final double[] gl = genotypeLikelihoods[j];
|
||||
final double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
|
||||
double aa = Double.NEGATIVE_INFINITY;
|
||||
double ab = Double.NEGATIVE_INFINITY;
|
||||
if (k < 2*j-1)
|
||||
aa = log10Cache[2*j-k] + log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()];
|
||||
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()];
|
||||
|
||||
if (k < 2*j)
|
||||
ab = log10Cache[2*k] + log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()];
|
||||
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()];
|
||||
|
||||
double log10Max;
|
||||
if (k > 1) {
|
||||
final double bb = log10Cache[k] + log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()];
|
||||
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()];
|
||||
log10Max = approximateLog10SumLog10(aa, ab, bb);
|
||||
} else {
|
||||
// we know we aren't considering the BB case, so we can use an optimized log10 function
|
||||
|
|
@ -372,7 +351,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY )
|
||||
return big;
|
||||
|
||||
if (big >= small + MAX_JACOBIAN_TOLERANCE)
|
||||
if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE)
|
||||
return big;
|
||||
|
||||
// OK, so |y-x| < tol: we use the following identity then:
|
||||
|
|
@ -383,14 +362,15 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// with integer quantization
|
||||
// we have pre-stored correction for 0,0.1,0.2,... 10.0
|
||||
//final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding
|
||||
int ind = (int)(Math.round((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding
|
||||
int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding
|
||||
|
||||
//double z =Math.log10(1+Math.pow(10.0,-diff));
|
||||
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
|
||||
return big + jacobianLogTable[ind];
|
||||
return big + MathUtils.jacobianLogTable[ind];
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Can be overridden by concrete subclasses
|
||||
* @param vc variant context with genotype likelihoods
|
||||
|
|
@ -549,7 +529,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
|
||||
double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector();
|
||||
//double logDenominator = Math.log10(2.0*j*(2.0*j-1));
|
||||
double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
|
||||
double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
|
||||
// special treatment for k=0: iteration reduces to:
|
||||
//YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()];
|
||||
|
|
@ -561,25 +541,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
double logNumerator[];
|
||||
logNumerator = new double[3];
|
||||
if (k < 2*j-1)
|
||||
logNumerator[0] = log10Cache[2*j-k] + log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
|
||||
logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
|
||||
genotypeLikelihoods[GenotypeType.AA.ordinal()];
|
||||
else
|
||||
logNumerator[0] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
|
||||
if (k < 2*j)
|
||||
logNumerator[1] = log10Cache[2*k] + log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
|
||||
logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
|
||||
genotypeLikelihoods[GenotypeType.AB.ordinal()];
|
||||
else
|
||||
logNumerator[1] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
if (k > 1)
|
||||
logNumerator[2] = log10Cache[k] + log10Cache[k-1] + logYMatrix[j-1][k-2] +
|
||||
logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] +
|
||||
genotypeLikelihoods[GenotypeType.BB.ordinal()];
|
||||
else
|
||||
logNumerator[2] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
double logNum = softMax(logNumerator);
|
||||
double logNum = MathUtils.softMax(logNumerator);
|
||||
|
||||
//YMatrix[j][k] = num/den;
|
||||
logYMatrix[j][k] = logNum - logDenominator;
|
||||
|
|
@ -602,47 +582,4 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
}
|
||||
|
||||
double softMax(double[] vec) {
|
||||
// compute naively log10(10^x[0] + 10^x[1]+...)
|
||||
// return Math.log10(MathUtils.sumLog10(vec));
|
||||
|
||||
// better approximation: do Jacobian logarithm function on data pairs
|
||||
double a = softMaxPair(vec[0],vec[1]);
|
||||
return softMaxPair(a,vec[2]);
|
||||
}
|
||||
|
||||
static public double softMaxPair(double x, double y) {
|
||||
if (Double.isInfinite(x))
|
||||
return y;
|
||||
|
||||
if (Double.isInfinite(y))
|
||||
return x;
|
||||
|
||||
if (y >= x + MAX_JACOBIAN_TOLERANCE)
|
||||
return y;
|
||||
if (x >= y + MAX_JACOBIAN_TOLERANCE)
|
||||
return x;
|
||||
|
||||
// OK, so |y-x| < tol: we use the following identity then:
|
||||
// we need to compute log10(10^x + 10^y)
|
||||
// By Jacobian logarithm identity, this is equal to
|
||||
// max(x,y) + log10(1+10^-abs(x-y))
|
||||
// we compute the second term as a table lookup
|
||||
// with integer quantization
|
||||
double diff = Math.abs(x-y);
|
||||
double t1 =x;
|
||||
if (y > x)
|
||||
t1 = y;
|
||||
// t has max(x,y)
|
||||
// we have pre-stored correction for 0,0.1,0.2,... 10.0
|
||||
int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP);
|
||||
double t2 = jacobianLogTable[ind];
|
||||
|
||||
// gdebug+
|
||||
//double z =Math.log10(1+Math.pow(10.0,-diff));
|
||||
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
|
||||
//gdebug-
|
||||
return t1+t2;
|
||||
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -95,6 +95,7 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false)
|
||||
public double INDEL_HETEROZYGOSITY = 1.0/8000;
|
||||
|
||||
// todo - remove these 3 arguments
|
||||
@Argument(fullName = "insertionStartProbability", shortName = "insertionStartProbability", doc = "Heterozygosity for indel calling", required = false)
|
||||
public double INSERTION_START_PROBABILITY = 1e-3;
|
||||
|
||||
|
|
@ -104,6 +105,33 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "alphaDeletionProbability", shortName = "alphaDeletionProbability", doc = "Heterozygosity for indel calling", required = false)
|
||||
public double ALPHA_DELETION_PROBABILITY = 1e-3;
|
||||
|
||||
|
||||
@Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty", required = false)
|
||||
public double INDEL_GAP_CONTINUATION_PENALTY = 10.0;
|
||||
|
||||
@Argument(fullName = "indelGapOpenPenalty", shortName = "indelGOP", doc = "Indel gap open penalty", required = false)
|
||||
public double INDEL_GAP_OPEN_PENALTY = 40.0;
|
||||
|
||||
@Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false)
|
||||
public int INDEL_HAPLOTYPE_SIZE = 80;
|
||||
//gdebug+
|
||||
@Hidden
|
||||
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
|
||||
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "s1", shortName = "s1", doc = "Output indel debug info", required = false)
|
||||
public String S1 = null;
|
||||
@Hidden
|
||||
@Argument(fullName = "s2", shortName = "s2", doc = "Output indel debug info", required = false)
|
||||
public String S2 = null;
|
||||
@Hidden
|
||||
@Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false)
|
||||
public boolean dovit = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "newlike", shortName = "newlike", doc = "Output indel debug info", required = false)
|
||||
public boolean newlike = false;
|
||||
//gdebug-
|
||||
|
||||
@Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false)
|
||||
public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL;
|
||||
|
||||
|
|
@ -135,13 +163,20 @@ public class UnifiedArgumentCollection {
|
|||
uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
|
||||
uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING;
|
||||
uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY;
|
||||
uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY;
|
||||
uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY;
|
||||
uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO;
|
||||
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
|
||||
|
||||
// todo- arguments to remove
|
||||
uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT;
|
||||
uac.INSERTION_START_PROBABILITY = INSERTION_START_PROBABILITY;
|
||||
uac.INSERTION_END_PROBABILITY = INSERTION_END_PROBABILITY;
|
||||
uac.ALPHA_DELETION_PROBABILITY = ALPHA_DELETION_PROBABILITY;
|
||||
uac.EXACT_CALCULATION_TYPE = EXACT_CALCULATION_TYPE;
|
||||
|
||||
uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT;
|
||||
|
||||
uac.S1 = S1;
|
||||
uac.S2 = S2;
|
||||
uac.dovit = dovit;
|
||||
|
||||
return uac;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -453,7 +453,7 @@ public class HaplotypeIndelErrorModel {
|
|||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+x0^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
// Second term is a constant added to both likelihoods so will be ignored
|
||||
haplotypeLikehoodMatrix[i][j] += ExactAFCalculationModel.softMaxPair(readLikelihood[0],
|
||||
haplotypeLikehoodMatrix[i][j] += MathUtils.softMax(readLikelihood[0],
|
||||
readLikelihood[1]);
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,773 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.indels;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class PairHMMIndelErrorModel {
|
||||
|
||||
|
||||
private final int BASE_QUAL_THRESHOLD = 4;
|
||||
|
||||
|
||||
private static final int MATCH_OFFSET = 0;
|
||||
private static final int X_OFFSET = 1;
|
||||
private static final int Y_OFFSET = 2;
|
||||
|
||||
private static final int DIAG = 0;
|
||||
private static final int UP = 1;
|
||||
private static final int LEFT = 2;
|
||||
|
||||
private static final int DIAG_GOTO_M = 0;
|
||||
private static final int DIAG_GOTO_X = 1;
|
||||
private static final int DIAG_GOTO_Y = 2;
|
||||
|
||||
private static final int UP_GOTO_M = 4;
|
||||
private static final int UP_GOTO_X = 5;
|
||||
private static final int UP_GOTO_Y = 6;
|
||||
|
||||
private static final int LEFT_GOTO_M = 8;
|
||||
private static final int LEFT_GOTO_X = 9;
|
||||
private static final int LEFT_GOTO_Y = 10;
|
||||
|
||||
private static final int[] ACTIONS_M = {DIAG_GOTO_M, DIAG_GOTO_X, DIAG_GOTO_Y};
|
||||
private static final int[] ACTIONS_X = {UP_GOTO_M, UP_GOTO_X, UP_GOTO_Y};
|
||||
private static final int[] ACTIONS_Y = {LEFT_GOTO_M, LEFT_GOTO_X, LEFT_GOTO_Y};
|
||||
|
||||
|
||||
private final double logGapOpenProbability;
|
||||
private final double logGapContinuationProbability;
|
||||
|
||||
private boolean DEBUG = false;
|
||||
|
||||
private static final int MAX_CACHED_QUAL = 127;
|
||||
|
||||
private static final double baseMatchArray[];
|
||||
private static final double baseMismatchArray[];
|
||||
|
||||
private final static double LOG_ONE_THIRD;
|
||||
private final static double LOG_ONE_HALF;
|
||||
/*
|
||||
private final double c;
|
||||
private final double d;
|
||||
private final double e;
|
||||
private final double oneMinusTwoEps;
|
||||
*/
|
||||
private final int trailingBases = 3;
|
||||
|
||||
private boolean doViterbi = false;
|
||||
private final boolean useSoftClippedBases = false;
|
||||
private final boolean useAffineGapModel = true;
|
||||
|
||||
private String s1;
|
||||
private String s2;
|
||||
|
||||
// private BAQ baq;
|
||||
|
||||
static {
|
||||
LOG_ONE_THIRD= -Math.log10(3.0);
|
||||
LOG_ONE_HALF= -Math.log10(2.0);
|
||||
final double OFFSET = -Math.log10(0.25);
|
||||
|
||||
baseMatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
|
||||
double baseProb = Math.pow(10, -k/10.);
|
||||
|
||||
|
||||
baseMatchArray[k] = Math.log10(1-baseProb)+ OFFSET;
|
||||
baseMismatchArray[k] = Math.log10(baseProb)+LOG_ONE_THIRD+OFFSET;
|
||||
}
|
||||
}
|
||||
|
||||
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, String s1, String s2, boolean dovit) {
|
||||
this(indelGOP, indelGCP, deb);
|
||||
this.s1 = s1;
|
||||
this.s2 = s2;
|
||||
this.doViterbi = dovit;
|
||||
}
|
||||
|
||||
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb) {
|
||||
|
||||
|
||||
this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob
|
||||
this.logGapContinuationProbability = -indelGCP/10.0; // QUAL to log prob
|
||||
|
||||
this.DEBUG = deb;
|
||||
|
||||
|
||||
// baq = new BAQ(Math.pow(10.0,logGapOpenProbability),Math.pow(10.0,logGapContinuationProbability),
|
||||
// 3,(byte)BASE_QUAL_THRESHOLD,true);
|
||||
|
||||
}
|
||||
|
||||
public double computeReadLikelihoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) {
|
||||
final int X_METRIC_LENGTH = readBases.length+1;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
|
||||
|
||||
// initialize path metric and traceback memories for likelihood computation
|
||||
double[][] pathMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
int[][] bestMetricArray = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
pathMetricArray[0][0]= 0;//Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (int i=1; i < X_METRIC_LENGTH; i++) {
|
||||
pathMetricArray[i][0] = 0;
|
||||
bestMetricArray[i][0] = UP;
|
||||
}
|
||||
|
||||
for (int j=1; j < Y_METRIC_LENGTH; j++) {
|
||||
pathMetricArray[0][j] = 0;//logGapOpenProbability + (j-1) * logGapContinuationProbability;
|
||||
bestMetricArray[0][j] = LEFT;
|
||||
}
|
||||
|
||||
for (int indI=1; indI < X_METRIC_LENGTH; indI++) {
|
||||
for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) {
|
||||
|
||||
byte x = readBases[indI-1];
|
||||
byte y = haplotypeBases[indJ-1];
|
||||
byte qual = readQuals[indI-1];
|
||||
|
||||
double bestMetric = 0.0;
|
||||
int bestMetricIdx = 0;
|
||||
|
||||
// compute metric for match/mismatch
|
||||
// workaround for reads whose bases quality = 0,
|
||||
if (qual < 1)
|
||||
qual = 1;
|
||||
|
||||
if (qual > MAX_CACHED_QUAL)
|
||||
qual = MAX_CACHED_QUAL;
|
||||
|
||||
double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];
|
||||
double[] metrics = new double[3];
|
||||
|
||||
metrics[DIAG] = pathMetricArray[indI-1][indJ-1] + pBaseRead;
|
||||
metrics[UP] = pathMetricArray[indI-1][indJ] + logGapOpenProbability;//(end?0.0:logGapOpenProbability);
|
||||
metrics[LEFT] = pathMetricArray[indI][indJ-1] + logGapOpenProbability;//(end?0.0:logGapOpenProbability);
|
||||
|
||||
if (doViterbi) {
|
||||
bestMetricIdx = MathUtils.maxElementIndex(metrics);
|
||||
bestMetric = metrics[bestMetricIdx];
|
||||
}
|
||||
else
|
||||
bestMetric = MathUtils.softMax(metrics);
|
||||
|
||||
pathMetricArray[indI][indJ] = bestMetric;
|
||||
bestMetricArray[indI][indJ] = bestMetricIdx;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
double bestMetric=0.0;
|
||||
int bestMetricIdx=0,bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1;
|
||||
|
||||
for (int i=0; i < X_METRIC_LENGTH; i ++ ) {
|
||||
int j= Y_METRIC_LENGTH-1;
|
||||
|
||||
if (pathMetricArray[i][j] > bestMetric) {
|
||||
bestMetric = pathMetricArray[i][j];
|
||||
bestI = i;
|
||||
bestJ = j;
|
||||
}
|
||||
}
|
||||
for (int j=0; j < Y_METRIC_LENGTH; j++ ) {
|
||||
int i= X_METRIC_LENGTH-1;
|
||||
if (pathMetricArray[i][j] >= bestMetric) {
|
||||
bestMetric = pathMetricArray[i][j];
|
||||
bestI = i;
|
||||
bestJ = j;
|
||||
}
|
||||
}
|
||||
|
||||
if (DEBUG && doViterbi) {
|
||||
|
||||
String haplotypeString = new String (haplotypeBases);
|
||||
String readString = new String(readBases);
|
||||
|
||||
|
||||
int i = bestI;
|
||||
int j = bestJ;
|
||||
|
||||
|
||||
System.out.println("Simple NW");
|
||||
|
||||
while (i >0 || j >0) {
|
||||
bestMetricIdx = bestMetricArray[i][j];
|
||||
System.out.print(bestMetricIdx);
|
||||
if (bestMetricIdx == UP) {
|
||||
// insert gap in Y
|
||||
haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j);
|
||||
i--;
|
||||
} else if (bestMetricIdx == LEFT) {
|
||||
readString = readString.substring(0,i)+"-"+readString.substring(i);
|
||||
j--;
|
||||
}
|
||||
else {
|
||||
i--; j--;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
System.out.println("\nAlignment: ");
|
||||
System.out.println("R:"+readString);
|
||||
System.out.println("H:"+haplotypeString);
|
||||
System.out.println();
|
||||
|
||||
|
||||
}
|
||||
if (DEBUG)
|
||||
System.out.format("Likelihood: %5.4f\n", bestMetric);
|
||||
|
||||
return bestMetric;
|
||||
|
||||
|
||||
}
|
||||
|
||||
public double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) {
|
||||
|
||||
final int X_METRIC_LENGTH = readBases.length+1;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length+1;
|
||||
|
||||
// initialize path metric and traceback memories for likelihood computation
|
||||
double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
int[][] bestActionArrayM = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
int[][] bestActionArrayX = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
int[][] bestActionArrayY = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
matchMetricArray[0][0]= 0;//Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (int i=1; i < X_METRIC_LENGTH; i++) {
|
||||
//initialize first column
|
||||
matchMetricArray[i][0] = Double.NEGATIVE_INFINITY;
|
||||
YMetricArray[i][0] = Double.NEGATIVE_INFINITY;
|
||||
XMetricArray[i][0] = 0;//logGapOpenProbability + (i-1)*logGapContinuationProbability;
|
||||
|
||||
bestActionArrayX[i][0] = bestActionArrayY[i][0] = bestActionArrayM[i][0] = UP_GOTO_X;
|
||||
}
|
||||
|
||||
for (int j=1; j < Y_METRIC_LENGTH; j++) {
|
||||
// initialize first row
|
||||
matchMetricArray[0][j] = Double.NEGATIVE_INFINITY;
|
||||
XMetricArray[0][j] = Double.NEGATIVE_INFINITY;
|
||||
YMetricArray[0][j] = 0;//logGapOpenProbability + (j-1) * logGapContinuationProbability;
|
||||
|
||||
bestActionArrayY[0][j] = bestActionArrayM[0][j] = bestActionArrayX[0][j] = LEFT_GOTO_Y;
|
||||
}
|
||||
|
||||
for (int indI=1; indI < X_METRIC_LENGTH; indI++) {
|
||||
for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) {
|
||||
|
||||
byte x = readBases[indI-1];
|
||||
byte y = haplotypeBases[indJ-1];
|
||||
byte qual = readQuals[indI-1];
|
||||
|
||||
double bestMetric = 0.0;
|
||||
int bestMetricIdx = 0;
|
||||
|
||||
// compute metric for match/mismatch
|
||||
// workaround for reads whose bases quality = 0,
|
||||
if (qual < 1)
|
||||
qual = 1;
|
||||
|
||||
if (qual > MAX_CACHED_QUAL)
|
||||
qual = MAX_CACHED_QUAL;
|
||||
|
||||
double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];
|
||||
|
||||
|
||||
double c,d;
|
||||
double[] metrics = new double[3];
|
||||
|
||||
// update match array
|
||||
metrics[MATCH_OFFSET] = matchMetricArray[indI-1][indJ-1] + pBaseRead;
|
||||
metrics[X_OFFSET] = XMetricArray[indI-1][indJ-1] + pBaseRead;
|
||||
metrics[Y_OFFSET] = YMetricArray[indI-1][indJ-1] + pBaseRead;
|
||||
|
||||
if (doViterbi) {
|
||||
bestMetricIdx = MathUtils.maxElementIndex(metrics);
|
||||
bestMetric = metrics[bestMetricIdx];
|
||||
}
|
||||
else
|
||||
bestMetric = MathUtils.softMax(metrics);
|
||||
|
||||
matchMetricArray[indI][indJ] = bestMetric;
|
||||
bestActionArrayM[indI][indJ] = ACTIONS_M[bestMetricIdx];
|
||||
|
||||
// update X array
|
||||
// State X(i,j): X(1:i) aligned to a gap in Y(1:j).
|
||||
// When in last column of X, ie X(1:i) aligned to full Y, we don't want to penalize gaps
|
||||
c = (indJ==Y_METRIC_LENGTH-1? 0: logGapOpenProbability);
|
||||
d = (indJ==Y_METRIC_LENGTH-1? 0: logGapContinuationProbability);
|
||||
metrics[MATCH_OFFSET] = matchMetricArray[indI-1][indJ] + c;
|
||||
metrics[X_OFFSET] = XMetricArray[indI-1][indJ] + d;
|
||||
metrics[Y_OFFSET] = Double.NEGATIVE_INFINITY; //YMetricArray[indI-1][indJ] + logGapOpenProbability;
|
||||
|
||||
if (doViterbi) {
|
||||
bestMetricIdx = MathUtils.maxElementIndex(metrics);
|
||||
bestMetric = metrics[bestMetricIdx];
|
||||
}
|
||||
else
|
||||
bestMetric = MathUtils.softMax(metrics);
|
||||
|
||||
XMetricArray[indI][indJ] = bestMetric;
|
||||
bestActionArrayX[indI][indJ] = ACTIONS_X[bestMetricIdx];
|
||||
|
||||
// update Y array
|
||||
c = (indI==X_METRIC_LENGTH-1? 0: logGapOpenProbability);
|
||||
d = (indI==X_METRIC_LENGTH-1? 0: logGapContinuationProbability);
|
||||
|
||||
metrics[MATCH_OFFSET] = matchMetricArray[indI][indJ-1] + c;
|
||||
metrics[X_OFFSET] = Double.NEGATIVE_INFINITY; //XMetricArray[indI][indJ-1] + logGapOpenProbability;
|
||||
metrics[Y_OFFSET] = YMetricArray[indI][indJ-1] + d;
|
||||
|
||||
if (doViterbi) {
|
||||
bestMetricIdx = MathUtils.maxElementIndex(metrics);
|
||||
bestMetric = metrics[bestMetricIdx];
|
||||
}
|
||||
else
|
||||
bestMetric = MathUtils.softMax(metrics);
|
||||
|
||||
YMetricArray[indI][indJ] = bestMetric;
|
||||
bestActionArrayY[indI][indJ] = ACTIONS_Y[bestMetricIdx];
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
double bestMetric;
|
||||
double metrics[] = new double[3];
|
||||
int bestTable=0, bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1;
|
||||
metrics[MATCH_OFFSET] = matchMetricArray[bestI][bestJ];
|
||||
metrics[X_OFFSET] = XMetricArray[bestI][bestJ];
|
||||
metrics[Y_OFFSET] = YMetricArray[bestI][bestJ];
|
||||
if (doViterbi) {
|
||||
bestTable = MathUtils.maxElementIndex(metrics);
|
||||
bestMetric = metrics[bestTable];
|
||||
}
|
||||
else
|
||||
bestMetric = MathUtils.softMax(metrics);
|
||||
|
||||
// Do traceback (needed only for debugging!)
|
||||
if (DEBUG && doViterbi) {
|
||||
|
||||
int bestAction;
|
||||
int i = bestI;
|
||||
int j = bestJ;
|
||||
|
||||
|
||||
System.out.println("Affine gap NW");
|
||||
|
||||
|
||||
String haplotypeString = new String (haplotypeBases);
|
||||
String readString = new String(readBases);
|
||||
|
||||
|
||||
while (i >0 || j >0) {
|
||||
if (bestTable == X_OFFSET) {
|
||||
// insert gap in Y
|
||||
haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j);
|
||||
bestAction = bestActionArrayX[i][j];
|
||||
}
|
||||
else if (bestTable == Y_OFFSET) {
|
||||
readString = readString.substring(0,i)+"-"+readString.substring(i);
|
||||
bestAction = bestActionArrayY[i][j];
|
||||
|
||||
}
|
||||
else {
|
||||
bestAction = bestActionArrayM[i][j];
|
||||
}
|
||||
System.out.print(bestAction);
|
||||
|
||||
|
||||
// bestAction contains action to take at next step
|
||||
// encoding of bestAction: upper 2 bits = direction, lower 2 bits = next table
|
||||
|
||||
// bestTable and nextDirection for next step
|
||||
bestTable = bestAction & 0x3;
|
||||
int nextDirection = bestAction >> 2;
|
||||
if (nextDirection == UP) {
|
||||
i--;
|
||||
} else if (nextDirection == LEFT) {
|
||||
j--;
|
||||
} else { // if (nextDirection == DIAG)
|
||||
i--; j--;
|
||||
}
|
||||
|
||||
/* if (j<0 || i < 0)
|
||||
{
|
||||
int k=1;
|
||||
}*/
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
System.out.println("\nAlignment: ");
|
||||
System.out.println("R:"+readString);
|
||||
System.out.println("H:"+haplotypeString);
|
||||
System.out.println();
|
||||
|
||||
|
||||
}
|
||||
if (DEBUG)
|
||||
System.out.format("Likelihood: %5.4f\n", bestMetric);
|
||||
|
||||
return bestMetric;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC, ReferenceContext ref){
|
||||
double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
|
||||
double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
|
||||
int readIdx=0;
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("Reference bases:");
|
||||
System.out.println(new String(ref.getBases()));
|
||||
}
|
||||
for (SAMRecord read : pileup.getReads()) {
|
||||
|
||||
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
|
||||
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
|
||||
if (DEBUG)
|
||||
System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
|
||||
read.getAlignmentStart(),
|
||||
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
|
||||
read.getCigarString());
|
||||
|
||||
|
||||
// get bases of candidate haplotypes that overlap with reads
|
||||
int offset = trailingBases / 2;
|
||||
long readStart = read.getUnclippedStart();
|
||||
long readEnd = read.getUnclippedEnd();
|
||||
|
||||
int numStartSoftClippedBases =0, numEndSoftClippedBases = 0;
|
||||
if (!useSoftClippedBases) {
|
||||
numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
|
||||
numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
|
||||
|
||||
}
|
||||
|
||||
byte[] unclippedReadBases, unclippedReadQuals;
|
||||
|
||||
int numStartClippedBases = numStartSoftClippedBases;
|
||||
int numEndClippedBases = numEndSoftClippedBases;
|
||||
unclippedReadBases = read.getReadBases();
|
||||
unclippedReadQuals = read.getBaseQualities();
|
||||
|
||||
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
|
||||
// and may leave a string of Q2 bases still hanging off the reads.
|
||||
for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) {
|
||||
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
|
||||
numStartClippedBases++;
|
||||
else
|
||||
break;
|
||||
|
||||
}
|
||||
for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){
|
||||
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
|
||||
numEndClippedBases++;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
long start = Math.max(readStart + numStartClippedBases - offset - ReadUtils.getFirstInsertionOffset(read), 0);
|
||||
long stop = readEnd -numEndClippedBases + offset + ReadUtils.getLastInsertionOffset(read);
|
||||
|
||||
// ok, we now figured out total number of clipped bases on both ends.
|
||||
// Figure out where we want to place the haplotype to score read against
|
||||
if (DEBUG)
|
||||
System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
|
||||
numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength());
|
||||
|
||||
// Variables start and stop are coordinates (inclusive) where we want to get the haplotype from.
|
||||
int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
|
||||
// check if start of read will be before start of reference context
|
||||
if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut
|
||||
start = ref.getWindow().getStart();
|
||||
|
||||
// check also if end of read will go beyond reference context
|
||||
if (stop > ref.getWindow().getStop())
|
||||
stop = ref.getWindow().getStop();
|
||||
|
||||
// if there's an insertion in the read, the read stop position will be less than start + read legnth,
|
||||
// but we want to compute likelihoods in the whole region that a read might overlap
|
||||
if (stop <= start + readLength) {
|
||||
stop = start + readLength-1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
byte[] readAlignedBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), read.getReadBases()); // Adjust the read bases based on the Cigar string
|
||||
byte[] readAlignedQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), read.getBaseQualities()); // Shift the location of the qual scores based on the Cigar string
|
||||
|
||||
|
||||
System.out.println("Aligned bases:"+new String(readAlignedBases));
|
||||
*/
|
||||
|
||||
|
||||
if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
|
||||
if (DEBUG)
|
||||
System.out.println("BAD READ!!");
|
||||
|
||||
for (int j=0; j < haplotypesInVC.size(); j++) {
|
||||
readLikelihoods[readIdx][j]= 0;
|
||||
}
|
||||
}
|
||||
else {
|
||||
byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
|
||||
unclippedReadBases.length-numEndClippedBases);
|
||||
|
||||
byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
|
||||
unclippedReadBases.length-numEndClippedBases);
|
||||
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("Read bases:");
|
||||
System.out.println(new String(readBases));
|
||||
}
|
||||
|
||||
|
||||
// start and stop have indices into
|
||||
|
||||
for (int j=0; j < haplotypesInVC.size(); j++) {
|
||||
Haplotype haplotype = haplotypesInVC.get(j);
|
||||
|
||||
|
||||
// cut haplotype bases
|
||||
//long indStart = start - haplotype.getStartPosition();
|
||||
//long indStop = stop - haplotype.getStartPosition();
|
||||
|
||||
//long indStart = 0;
|
||||
//long indStop = 400;
|
||||
//byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
|
||||
// (int)indStart, (int)indStop);
|
||||
byte[] haplotypeBases = haplotype.getBasesAsBytes();
|
||||
//BAQ.BAQCalculationResult baqRes = baq.calcBAQFromHMM(read, haplotypeBases, (int)(start - readStart));
|
||||
/* if (s1 != null && s2 != null) {
|
||||
haplotypeBases = s2.getBytes();
|
||||
readBases = s1.getBytes();
|
||||
readQuals = null;
|
||||
readQuals = new byte[readBases.length];
|
||||
for (int i=0; i < readQuals.length; i++)
|
||||
readQuals[i] = (byte)30;
|
||||
|
||||
byte[] iq = new byte[readBases.length];
|
||||
byte[] q = new byte[readBases.length];
|
||||
int[] state = new int[readBases.length];
|
||||
int a=baq.hmm_glocal(haplotypeBases, readBases, 0, readBases.length, iq, state, q);
|
||||
System.out.println("BAQ State:");
|
||||
for (int i=0; i < state.length; i++)
|
||||
System.out.format("%d ",state[i]);
|
||||
System.out.println();
|
||||
System.out.println("BAQ Q:");
|
||||
for (int i=0; i < q.length; i++)
|
||||
System.out.format("%d ",(int)q[i]);
|
||||
System.out.println();
|
||||
|
||||
} */
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("Haplotype to test:");
|
||||
System.out.println(new String(haplotypeBases));
|
||||
}
|
||||
|
||||
if (useAffineGapModel)
|
||||
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals);
|
||||
else
|
||||
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
readIdx++;
|
||||
}
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("\nLikelihood summary");
|
||||
for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
|
||||
System.out.format("Read Index: %d ",readIdx);
|
||||
for (int i=0; i < readLikelihoods[readIdx].length; i++)
|
||||
System.out.format("L%d: %f ",i,readLikelihoods[readIdx][i]);
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
}
|
||||
for (int i=0; i < haplotypesInVC.size(); i++) {
|
||||
for (int j=i; j < haplotypesInVC.size(); j++){
|
||||
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
|
||||
// L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
|
||||
//readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
|
||||
double[] readLikelihood = new double[2]; // diploid sample
|
||||
for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
|
||||
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
if (Double.isInfinite(readLikelihoods[readIdx][i]) || Double.isInfinite(readLikelihoods[readIdx][j]))
|
||||
continue;
|
||||
haplotypeLikehoodMatrix[i][j] += ( MathUtils.softMax(readLikelihoods[readIdx][i],
|
||||
readLikelihoods[readIdx][j]) + LOG_ONE_HALF);
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return haplotypeLikehoodMatrix;
|
||||
|
||||
}
|
||||
|
||||
public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
|
||||
int hSize = haplotypeLikehoodMatrix.length;
|
||||
double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];
|
||||
|
||||
int k=0;
|
||||
double maxElement = Double.NEGATIVE_INFINITY;
|
||||
for (int i=0; i < hSize; i++) {
|
||||
for (int j=i; j < hSize; j++){
|
||||
genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j];
|
||||
if (haplotypeLikehoodMatrix[i][j] > maxElement)
|
||||
maxElement = haplotypeLikehoodMatrix[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// renormalize
|
||||
for (int i=0; i < genotypeLikelihoods.length; i++)
|
||||
genotypeLikelihoods[i] -= maxElement;
|
||||
|
||||
return genotypeLikelihoods;
|
||||
|
||||
}
|
||||
|
||||
/* Retrieves bases and qualities from a read that align to a start and stop position within a given contig */
|
||||
public static Pair<byte[],byte[]> getBasesBetweenPositions(SAMRecord read, long startPos, long endPos ) {
|
||||
|
||||
final Cigar cigar = read.getCigar();
|
||||
final byte[] bases = read.getReadBases();
|
||||
final byte[] quals = read.getBaseQualities();
|
||||
// final byte[] alignment = new byte[alignmentLength];
|
||||
long alignStartPos = startPos;
|
||||
long readStartPos = read.getUnclippedStart();
|
||||
|
||||
if (alignStartPos < readStartPos )
|
||||
alignStartPos = readStartPos;
|
||||
|
||||
long alignEndPos = endPos;
|
||||
long readEndPos = read.getUnclippedEnd();
|
||||
|
||||
if (alignEndPos > readEndPos )
|
||||
alignEndPos = readEndPos;
|
||||
|
||||
// We need to now locate read bases between alignStartPos and alignEndPos
|
||||
|
||||
//first see how many bytes will we require for output
|
||||
final byte[] readAlignment = new byte[bases.length];
|
||||
final byte[] qualAlignment = new byte[bases.length];
|
||||
|
||||
int outPos = 0;
|
||||
int readPos = 0;
|
||||
int alignPos = 0;
|
||||
for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
|
||||
|
||||
final CigarElement ce = cigar.getCigarElement(iii);
|
||||
final int elementLength = ce.getLength();
|
||||
|
||||
switch( ce.getOperator() ) {
|
||||
case I:
|
||||
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
|
||||
// check if insertion falls before window we're looking for
|
||||
if (readStartPos +readPos >= alignStartPos) {
|
||||
readAlignment[outPos] = bases[readPos];
|
||||
qualAlignment[outPos] = quals[readPos];
|
||||
outPos++;
|
||||
}
|
||||
readPos++;
|
||||
}
|
||||
break;
|
||||
case D:
|
||||
case N:
|
||||
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
|
||||
alignPos++;
|
||||
if (alignStartPos + alignPos > alignEndPos)
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case S:
|
||||
case M:
|
||||
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
|
||||
if (readStartPos +readPos >= alignStartPos) {
|
||||
readAlignment[outPos] = bases[readPos];
|
||||
qualAlignment[outPos] = quals[readPos];
|
||||
outPos++;
|
||||
alignPos++;
|
||||
}
|
||||
|
||||
readPos++;
|
||||
|
||||
if (alignStartPos + alignPos > alignEndPos)
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case H:
|
||||
case P:
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
|
||||
}
|
||||
// if we're already beyong of edge of window of interest, nothing more to do
|
||||
if (alignStartPos + alignPos > alignEndPos)
|
||||
break;
|
||||
|
||||
}
|
||||
return new Pair(Arrays.copyOf(readAlignment,outPos), Arrays.copyOf(qualAlignment,outPos));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -873,4 +873,85 @@ public class MathUtils {
|
|||
|
||||
/** Returns num/denom as a double; a denominator below 1 is clamped to 1 to avoid division by zero. */
public static double ratio(int num, int denom) {
    final int safeDenom = Math.max(denom, 1);
    return (double) num / safeDenom;
}
|
||||
/** Returns num/denom as a double; a denominator below 1 is clamped to 1 to avoid division by zero. */
public static double ratio(long num, long denom) {
    final long safeDenom = Math.max(denom, 1L);
    return (double) num / safeDenom;
}
|
||||
|
||||
// Cache of log10(k) for k in [0, 2*MAXN); index 0 holds log10(0) = -Infinity.
public static final double[] log10Cache;
// Precomputed correction term log10(1 + 10^-d) for d = 0, 0.1, ..., 10.0,
// used by softMax() to approximate log10(10^x + 10^y) via the Jacobian logarithm.
public static final double[] jacobianLogTable;
public static final int JACOBIAN_LOG_TABLE_SIZE = 101;
public static final double JACOBIAN_LOG_TABLE_STEP = 0.1;
// Beyond this log10 difference the smaller term is negligible at table precision,
// so softMax() returns the larger argument directly (the table only covers 0..10).
public static final double MAX_JACOBIAN_TOLERANCE = 10.0;
private static final int MAXN = 10000;

// Populate both lookup tables once at class-load time.
static {
    log10Cache = new double[2*MAXN];
    jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE];

    log10Cache[0] = Double.NEGATIVE_INFINITY;
    for (int k=1; k < 2*MAXN; k++)
        log10Cache[k] = Math.log10(k);

    for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
        // Entry k corresponds to a difference of k * JACOBIAN_LOG_TABLE_STEP log10 units.
        jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k)
                * JACOBIAN_LOG_TABLE_STEP));

    }
}
|
||||
|
||||
static public double softMax(double[] vec) {
|
||||
double acc = vec[0];
|
||||
for (int k=1; k < vec.length; k++)
|
||||
acc = softMax(acc,vec[k]);
|
||||
|
||||
return acc;
|
||||
|
||||
}
|
||||
|
||||
static public double max(double x0, double x1, double x2) {
|
||||
double a = Math.max(x0,x1);
|
||||
return Math.max(a,x2);
|
||||
}
|
||||
|
||||
static public double softMax(double x0, double x1, double x2) {
|
||||
// compute naively log10(10^x[0] + 10^x[1]+...)
|
||||
// return Math.log10(MathUtils.sumLog10(vec));
|
||||
|
||||
// better approximation: do Jacobian logarithm function on data pairs
|
||||
double a = softMax(x0,x1);
|
||||
return softMax(a,x2);
|
||||
}
|
||||
|
||||
/**
 * Approximates log10(10^x + 10^y) using the Jacobian logarithm identity
 * max(x,y) + log10(1 + 10^-|x-y|), with the correction term read from the
 * precomputed jacobianLogTable (0.1 log10-unit quantization).
 *
 * NOTE(review): the isInfinite() guards are evidently intended for
 * NEGATIVE_INFINITY inputs (log10 of a zero probability); a POSITIVE_INFINITY
 * x would incorrectly return y. Confirm callers only pass finite or
 * negative-infinite log values.
 */
static public double softMax(double x, double y) {
    // An infinite argument (log10 of probability 0) contributes nothing to the sum.
    if (Double.isInfinite(x))
        return y;

    if (Double.isInfinite(y))
        return x;

    // If the terms differ by more than the table range (10 log10 units),
    // the smaller term is negligible at this precision.
    if (y >= x + MAX_JACOBIAN_TOLERANCE)
        return y;
    if (x >= y + MAX_JACOBIAN_TOLERANCE)
        return x;

    // OK, so |y-x| < tol: we use the following identity then:
    // we need to compute log10(10^x + 10^y)
    // By Jacobian logarithm identity, this is equal to
    // max(x,y) + log10(1+10^-abs(x-y))
    // we compute the second term as a table lookup
    // with integer quantization
    double diff = Math.abs(x-y);
    double t1 =x;
    if (y > x)
        t1 = y;
    // t1 holds max(x,y)
    // we have pre-stored correction for 0,0.1,0.2,... 10.0
    int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP);
    double t2 = jacobianLogTable[ind];

    // gdebug+
    //double z =Math.log10(1+Math.pow(10.0,-diff));
    //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
    //gdebug-
    return t1+t2;
    // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,9 +6,11 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
/*
|
||||
The topology of the profile HMM:
|
||||
|
|
@ -261,7 +263,26 @@ public class BAQ {
|
|||
s[l_query+1] = sum; // the last scaling factor
|
||||
}
|
||||
|
||||
//gdbebug+
|
||||
/*
|
||||
double cac=0.;
|
||||
// undo scaling of forward probabilities to obtain plain probability of observation given model
|
||||
double[] su = new double[f[l_query].length];
|
||||
{
|
||||
double sum = 0.;
|
||||
double[] logs = new double[s.length];
|
||||
for (k=0; k < logs.length; k++) {
|
||||
logs[k] = Math.log10(s[k]);
|
||||
sum += logs[k];
|
||||
}
|
||||
for (k=0; k < f[l_query].length; k++)
|
||||
su[k]= Math.log10(f[l_query][k])+ sum;
|
||||
|
||||
cac = MathUtils.softMax(su);
|
||||
}
|
||||
System.out.format("s:%f\n",cac);
|
||||
// gdebug-
|
||||
*/
|
||||
/*** backward ***/
|
||||
// b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
|
||||
for (k = 1; k <= l_ref; ++k) {
|
||||
|
|
@ -321,7 +342,7 @@ public class BAQ {
|
|||
max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
|
||||
if (state != null) state[qstart+i-1] = max_k;
|
||||
if (q != null) {
|
||||
k = (int)(-4.343 * Math.log(1. - max) + .499);
|
||||
k = (int)(-4.343 * Math.log(1. - max) + .499); // = 10*log10(1-max)
|
||||
q[qstart+i-1] = (byte)(k > 100? 99 : (k < minBaseQual ? minBaseQual : k));
|
||||
}
|
||||
//System.out.println("("+pb+","+sum+")"+" ("+(i-1)+","+(max_k>>2)+","+(max_k&3)+","+max+")");
|
||||
|
|
@ -457,28 +478,12 @@ public class BAQ {
|
|||
}
|
||||
}
|
||||
|
||||
private static int getFirstInsertionOffset(SAMRecord read) {
|
||||
CigarElement e = read.getCigar().getCigarElement(0);
|
||||
if ( e.getOperator() == CigarOperator.I )
|
||||
return e.getLength();
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
private static int getLastInsertionOffset(SAMRecord read) {
|
||||
CigarElement e = read.getCigar().getCigarElement(read.getCigarLength()-1);
|
||||
if ( e.getOperator() == CigarOperator.I )
|
||||
return e.getLength();
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) {
|
||||
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) {
|
||||
// start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar
|
||||
int offset = getBandWidth() / 2;
|
||||
long readStart = includeClippedBases ? read.getUnclippedStart() : read.getAlignmentStart();
|
||||
long start = Math.max(readStart - offset - getFirstInsertionOffset(read), 0);
|
||||
long stop = (includeClippedBases ? read.getUnclippedEnd() : read.getAlignmentEnd()) + offset + getLastInsertionOffset(read);
|
||||
long start = Math.max(readStart - offset - ReadUtils.getFirstInsertionOffset(read), 0);
|
||||
long stop = (includeClippedBases ? read.getUnclippedEnd() : read.getAlignmentEnd()) + offset + ReadUtils.getLastInsertionOffset(read);
|
||||
|
||||
if ( stop > refReader.getSequenceDictionary().getSequence(read.getReferenceName()).getSequenceLength() ) {
|
||||
return null;
|
||||
|
|
|
|||
|
|
@ -40,8 +40,7 @@ public class Haplotype {
|
|||
protected double[] quals = null;
|
||||
private GenomeLoc genomeLocation = null;
|
||||
private boolean isReference = false;
|
||||
public static final int LEFT_WINDOW_SIZE = 20;
|
||||
|
||||
|
||||
/**
|
||||
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
|
||||
*
|
||||
|
|
@ -100,11 +99,17 @@ public class Haplotype {
|
|||
return genomeLocation.getStart();
|
||||
}
|
||||
|
||||
/**
 * @return the stop coordinate (inclusive) of this haplotype's genomic location
 */
public long getStopPosition() {
    return genomeLocation.getStop();
}
|
||||
|
||||
|
||||
/**
 * @return true if this haplotype represents the reference sequence
 */
public boolean isReference() {
    return isReference;
}
|
||||
|
||||
public static List<Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref, final int haplotypeSize) {
|
||||
public static List<Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref,
|
||||
final int haplotypeSize, final int numPrefBases) {
|
||||
|
||||
|
||||
List<Haplotype> haplotypeList = new ArrayList<Haplotype>();
|
||||
|
|
@ -124,12 +129,11 @@ public class Haplotype {
|
|||
|
||||
byte[] refBases = ref.getBases();
|
||||
|
||||
int numPrefBases = LEFT_WINDOW_SIZE;
|
||||
|
||||
int startIdxInReference = (int)(1+startPos-numPrefBases-ref.getWindow().getStart());
|
||||
//int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event
|
||||
|
||||
|
||||
|
||||
byte[] basesBeforeVariant = Arrays.copyOfRange(refBases,startIdxInReference,startIdxInReference+numPrefBases);
|
||||
byte[] basesAfterVariant = Arrays.copyOfRange(refBases,
|
||||
startIdxInReference+numPrefBases+ refAllele.getBases().length, refBases.length);
|
||||
|
|
|
|||
|
|
@ -228,4 +228,24 @@ public class ReadUtils {
|
|||
return Collections.emptyList();
|
||||
}
|
||||
}
|
||||
|
||||
public final static int getFirstInsertionOffset(SAMRecord read) {
|
||||
CigarElement e = read.getCigar().getCigarElement(0);
|
||||
if ( e.getOperator() == CigarOperator.I )
|
||||
return e.getLength();
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
public final static int getLastInsertionOffset(SAMRecord read) {
|
||||
CigarElement e = read.getCigar().getCigarElement(read.getCigarLength()-1);
|
||||
if ( e.getOperator() == CigarOperator.I )
|
||||
return e.getLength();
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue