Initial commit of a new likelihood model to evaluate indel quality. The principle is simple: a plain Pair HMM with affine gap penalties (in log space) that does quasi-local alignment between reads and candidate haplotypes, and which in theory should be more solid and more reliable than the older Dindel-based model. It is also easy to extend in the future if we decide to introduce context-dependent and/or read-dependent gap penalties.

The model is disabled by default and we're still using the old Dindel model until I'm more confident that the new model is a definitive improvement, so right now it is enabled only via hidden command line arguments, and it's not to be used yet.

In detail:
a) Several refactorings to make softMax() available to other modules, so it's now part of MathUtils.
b) Refactored a couple of read utilities and moved them from BAQ to ReadUtils.
c) New PairHMMIndelErrorModel class implementing new likelihood model
d) Several new hidden debug arguments in UAC.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5389 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2011-03-07 15:31:58 +00:00
parent 96fe540d66
commit 8c262eb605
9 changed files with 1000 additions and 119 deletions

View File

@ -30,7 +30,10 @@ import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.genotype.Haplotype;
@ -46,25 +49,39 @@ import java.util.*;
public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private final int maxReadDeletionLength = 3;
private final int HAPLOTYPE_SIZE = 80;
private final int HAPLOTYPE_SIZE;
private final int minIndelCountForGenotyping;
private final boolean getAlleleListFromVCF;
// todo - the following need to be exposed for command line argument control
private final double indelHeterozygosity = 1.0/8000;
boolean useFlatPriors = true;
private static final boolean DEBUGOUT = false;
private static final boolean DEBUG = false;
private boolean DEBUG = false;
// todo -cleanup
private HaplotypeIndelErrorModel model;
private PairHMMIndelErrorModel pairModel;
private boolean newLike = false;
private GenomeLoc lastSiteVisited;
private List<Allele> alleleList;
protected DindelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
model = new HaplotypeIndelErrorModel(maxReadDeletionLength, UAC.INSERTION_START_PROBABILITY,
UAC.INSERTION_END_PROBABILITY, UAC.ALPHA_DELETION_PROBABILITY, HAPLOTYPE_SIZE, false, DEBUGOUT);
if (UAC.newlike) {
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.S1, UAC.S2, UAC.dovit);
newLike = true;
}
else
model = new HaplotypeIndelErrorModel(maxReadDeletionLength, UAC.INSERTION_START_PROBABILITY,
UAC.INSERTION_END_PROBABILITY,UAC.ALPHA_DELETION_PROBABILITY,UAC.INDEL_HAPLOTYPE_SIZE, false, UAC.OUTPUT_DEBUG_INDEL_INFO);
alleleList = new ArrayList<Allele>();
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
}
@ -300,11 +317,15 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
if (eventLength<0)
eventLength = - eventLength;
int currentHaplotypeSize = HAPLOTYPE_SIZE;
// int numSamples = getNSamples(contexts);
List<Haplotype> haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
ref, currentHaplotypeSize);
List<Haplotype> haplotypesInVC;
if (newLike)
haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
ref, HAPLOTYPE_SIZE, HAPLOTYPE_SIZE/2);
else
haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
ref, HAPLOTYPE_SIZE, 20);
// For each sample, get genotype likelihoods based on pileup
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
// initialize the GenotypeLikelihoods
@ -316,7 +337,7 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
priors = new DiploidIndelGenotypePriors();
}
else
priors = new DiploidIndelGenotypePriors(indelHeterozygosity,eventLength,currentHaplotypeSize);
priors = new DiploidIndelGenotypePriors(indelHeterozygosity,eventLength,HAPLOTYPE_SIZE);
//double[] priorLikelihoods = priors.getPriors();
@ -330,7 +351,11 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
pileup = context.getBasePileup();
if (pileup != null ) {
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC);
if (newLike)
haplotypeLikehoodMatrix = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, ref);
else
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC);
double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix);
@ -342,7 +367,8 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
genotypeLikelihoods[1],
genotypeLikelihoods[2],
getFilteredDepth(pileup)));
//System.out.format("%4.2f %4.2f %4.2f\n",genotypeLikelihoods[0],genotypeLikelihoods[1], genotypeLikelihoods[2]);
if (DEBUG)
System.out.format("Sample:%s GL:%4.2f %4.2f %4.2f\n",sample.getKey(), genotypeLikelihoods[0],genotypeLikelihoods[1], genotypeLikelihoods[2]);
}
}

View File

@ -57,29 +57,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private boolean SIMPLE_GREEDY_GENOTYPER = false;
private static final double[] log10Cache;
private static final double[] jacobianLogTable;
// private static final int JACOBIAN_LOG_TABLE_SIZE = 100001;
// private static final double JACOBIAN_LOG_TABLE_STEP = 0.0001;
private static final int JACOBIAN_LOG_TABLE_SIZE = 101;
private static final double JACOBIAN_LOG_TABLE_STEP = 0.1;
private static final double MAX_JACOBIAN_TOLERANCE = 10.0;
private static final int MAXN = 10000; // todo -- warning, this might be hit at some point...
static {
log10Cache = new double[2*MAXN];
jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE];
log10Cache[0] = Double.NEGATIVE_INFINITY;
for (int k=1; k < 2*MAXN; k++)
log10Cache[k] = Math.log10(k);
for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k) * JACOBIAN_LOG_TABLE_STEP));
}
}
final private ExactCalculation calcToUse;
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
@ -314,19 +293,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
for ( int j=1; j <= numSamples; j++ ) {
final double[] gl = genotypeLikelihoods[j];
final double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
double aa = Double.NEGATIVE_INFINITY;
double ab = Double.NEGATIVE_INFINITY;
if (k < 2*j-1)
aa = log10Cache[2*j-k] + log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()];
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()];
if (k < 2*j)
ab = log10Cache[2*k] + log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()];
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()];
double log10Max;
if (k > 1) {
final double bb = log10Cache[k] + log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()];
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()];
log10Max = approximateLog10SumLog10(aa, ab, bb);
} else {
// we know we aren't considering the BB case, so we can use an optimized log10 function
@ -372,7 +351,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY )
return big;
if (big >= small + MAX_JACOBIAN_TOLERANCE)
if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE)
return big;
// OK, so |y-x| < tol: we use the following identity then:
@ -383,14 +362,15 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// with integer quantization
// we have pre-stored correction for 0,0.1,0.2,... 10.0
//final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding
int ind = (int)(Math.round((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding
int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding
//double z =Math.log10(1+Math.pow(10.0,-diff));
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
return big + jacobianLogTable[ind];
return big + MathUtils.jacobianLogTable[ind];
}
/**
* Can be overridden by concrete subclasses
* @param vc variant context with genotype likelihoods
@ -549,7 +529,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector();
//double logDenominator = Math.log10(2.0*j*(2.0*j-1));
double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
// special treatment for k=0: iteration reduces to:
//YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()];
@ -561,25 +541,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
double logNumerator[];
logNumerator = new double[3];
if (k < 2*j-1)
logNumerator[0] = log10Cache[2*j-k] + log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
genotypeLikelihoods[GenotypeType.AA.ordinal()];
else
logNumerator[0] = Double.NEGATIVE_INFINITY;
if (k < 2*j)
logNumerator[1] = log10Cache[2*k] + log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
genotypeLikelihoods[GenotypeType.AB.ordinal()];
else
logNumerator[1] = Double.NEGATIVE_INFINITY;
if (k > 1)
logNumerator[2] = log10Cache[k] + log10Cache[k-1] + logYMatrix[j-1][k-2] +
logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] +
genotypeLikelihoods[GenotypeType.BB.ordinal()];
else
logNumerator[2] = Double.NEGATIVE_INFINITY;
double logNum = softMax(logNumerator);
double logNum = MathUtils.softMax(logNumerator);
//YMatrix[j][k] = num/den;
logYMatrix[j][k] = logNum - logDenominator;
@ -602,47 +582,4 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
}
double softMax(double[] vec) {
// compute naively log10(10^x[0] + 10^x[1]+...)
// return Math.log10(MathUtils.sumLog10(vec));
// better approximation: do Jacobian logarithm function on data pairs
double a = softMaxPair(vec[0],vec[1]);
return softMaxPair(a,vec[2]);
}
static public double softMaxPair(double x, double y) {
if (Double.isInfinite(x))
return y;
if (Double.isInfinite(y))
return x;
if (y >= x + MAX_JACOBIAN_TOLERANCE)
return y;
if (x >= y + MAX_JACOBIAN_TOLERANCE)
return x;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
double diff = Math.abs(x-y);
double t1 =x;
if (y > x)
t1 = y;
// t has max(x,y)
// we have pre-stored correction for 0,0.1,0.2,... 10.0
int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP);
double t2 = jacobianLogTable[ind];
// gdebug+
//double z =Math.log10(1+Math.pow(10.0,-diff));
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
//gdebug-
return t1+t2;
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
}
}

View File

@ -95,6 +95,7 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false)
public double INDEL_HETEROZYGOSITY = 1.0/8000;
// todo - remove these 3 arguments
@Argument(fullName = "insertionStartProbability", shortName = "insertionStartProbability", doc = "Heterozygosity for indel calling", required = false)
public double INSERTION_START_PROBABILITY = 1e-3;
@ -104,6 +105,33 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "alphaDeletionProbability", shortName = "alphaDeletionProbability", doc = "Heterozygosity for indel calling", required = false)
public double ALPHA_DELETION_PROBABILITY = 1e-3;
@Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty", required = false)
public double INDEL_GAP_CONTINUATION_PENALTY = 10.0;
@Argument(fullName = "indelGapOpenPenalty", shortName = "indelGOP", doc = "Indel gap open penalty", required = false)
public double INDEL_GAP_OPEN_PENALTY = 40.0;
@Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false)
public int INDEL_HAPLOTYPE_SIZE = 80;
//gdebug+
@Hidden
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
@Hidden
@Argument(fullName = "s1", shortName = "s1", doc = "Output indel debug info", required = false)
public String S1 = null;
@Hidden
@Argument(fullName = "s2", shortName = "s2", doc = "Output indel debug info", required = false)
public String S2 = null;
@Hidden
@Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false)
public boolean dovit = false;
@Hidden
@Argument(fullName = "newlike", shortName = "newlike", doc = "Output indel debug info", required = false)
public boolean newlike = false;
//gdebug-
@Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false)
public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL;
@ -135,13 +163,20 @@ public class UnifiedArgumentCollection {
uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING;
uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY;
uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY;
uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY;
uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO;
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
// todo- arguments to remove
uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT;
uac.INSERTION_START_PROBABILITY = INSERTION_START_PROBABILITY;
uac.INSERTION_END_PROBABILITY = INSERTION_END_PROBABILITY;
uac.ALPHA_DELETION_PROBABILITY = ALPHA_DELETION_PROBABILITY;
uac.EXACT_CALCULATION_TYPE = EXACT_CALCULATION_TYPE;
uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT;
uac.S1 = S1;
uac.S2 = S2;
uac.dovit = dovit;
return uac;
}

View File

@ -453,7 +453,7 @@ public class HaplotypeIndelErrorModel {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+x0^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
// Second term is a constant added to both likelihoods so will be ignored
haplotypeLikehoodMatrix[i][j] += ExactAFCalculationModel.softMaxPair(readLikelihood[0],
haplotypeLikehoodMatrix[i][j] += MathUtils.softMax(readLikelihood[0],
readLikelihood[1]);
}

View File

@ -0,0 +1,773 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.genotype.Haplotype;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.Arrays;
import java.util.List;
public class PairHMMIndelErrorModel {
private final int BASE_QUAL_THRESHOLD = 4;
private static final int MATCH_OFFSET = 0;
private static final int X_OFFSET = 1;
private static final int Y_OFFSET = 2;
private static final int DIAG = 0;
private static final int UP = 1;
private static final int LEFT = 2;
private static final int DIAG_GOTO_M = 0;
private static final int DIAG_GOTO_X = 1;
private static final int DIAG_GOTO_Y = 2;
private static final int UP_GOTO_M = 4;
private static final int UP_GOTO_X = 5;
private static final int UP_GOTO_Y = 6;
private static final int LEFT_GOTO_M = 8;
private static final int LEFT_GOTO_X = 9;
private static final int LEFT_GOTO_Y = 10;
private static final int[] ACTIONS_M = {DIAG_GOTO_M, DIAG_GOTO_X, DIAG_GOTO_Y};
private static final int[] ACTIONS_X = {UP_GOTO_M, UP_GOTO_X, UP_GOTO_Y};
private static final int[] ACTIONS_Y = {LEFT_GOTO_M, LEFT_GOTO_X, LEFT_GOTO_Y};
private final double logGapOpenProbability;
private final double logGapContinuationProbability;
private boolean DEBUG = false;
private static final int MAX_CACHED_QUAL = 127;
private static final double baseMatchArray[];
private static final double baseMismatchArray[];
private final static double LOG_ONE_THIRD;
private final static double LOG_ONE_HALF;
/*
private final double c;
private final double d;
private final double e;
private final double oneMinusTwoEps;
*/
private final int trailingBases = 3;
private boolean doViterbi = false;
private final boolean useSoftClippedBases = false;
private final boolean useAffineGapModel = true;
private String s1;
private String s2;
// private BAQ baq;
// Precompute per-quality log10 emission terms so the inner DP loop never calls
// Math.pow/Math.log10. For Phred quality k, the base error probability is
// p = 10^(-k/10); a match contributes log10(1-p) and a mismatch
// log10(p) + log10(1/3) (the mismatched base is one of three alternatives).
// OFFSET = -log10(0.25) is added to BOTH tables — a constant per-base shift,
// presumably a normalization that cancels when likelihoods are compared
// across haplotypes (TODO confirm).
// NOTE(review): index 0 of both tables is left at 0.0; callers clamp qual to
// >= 1 before lookup, so that slot is never read.
static {
    LOG_ONE_THIRD= -Math.log10(3.0);
    LOG_ONE_HALF= -Math.log10(2.0);
    final double OFFSET = -Math.log10(0.25);
    baseMatchArray = new double[MAX_CACHED_QUAL+1];
    baseMismatchArray = new double[MAX_CACHED_QUAL+1];
    for (int k=1; k <= MAX_CACHED_QUAL; k++) {
        double baseProb = Math.pow(10, -k/10.);
        baseMatchArray[k] = Math.log10(1-baseProb)+ OFFSET;
        baseMismatchArray[k] = Math.log10(baseProb)+LOG_ONE_THIRD+OFFSET;
    }
}
/**
 * Debug-oriented constructor: configures the model like the basic constructor
 * and additionally installs two debug sample names and the Viterbi switch.
 *
 * @param indelGOP gap open penalty, in Phred-scaled (QUAL) units
 * @param indelGCP gap continuation penalty, in Phred-scaled (QUAL) units
 * @param deb      whether to emit verbose debug output
 * @param s1       first debug sample name (hidden argument; may be null)
 * @param s2       second debug sample name (hidden argument; may be null)
 * @param dovit    if true, use Viterbi (max) recursion instead of the full-sum HMM
 */
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, String s1, String s2, boolean dovit) {
    // Delegate the penalty/debug setup, then record the extra debug knobs.
    this(indelGOP, indelGCP, deb);
    this.doViterbi = dovit;
    this.s1 = s1;
    this.s2 = s2;
}
/**
 * Basic constructor: converts the Phred-scaled gap penalties to log10
 * probabilities (QUAL q maps to -q/10) and stores the debug flag.
 *
 * @param indelGOP gap open penalty, in Phred-scaled (QUAL) units
 * @param indelGCP gap continuation penalty, in Phred-scaled (QUAL) units
 * @param deb      whether to emit verbose debug output
 */
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb) {
    this.DEBUG = deb;
    // QUAL-to-log10-probability conversion for both affine-gap penalties.
    this.logGapOpenProbability = -indelGOP/10.0;
    this.logGapContinuationProbability = -indelGCP/10.0;
}
/**
 * Scores a read against one candidate haplotype with a simple (non-affine,
 * single-table) Needleman–Wunsch-style dynamic program in log10 space.
 *
 * Rows (index I) correspond to read bases, columns (index J) to haplotype
 * bases; cell values accumulate log10 emission/gap terms. Both the first row
 * and first column are initialized to 0 rather than to gap penalties, and the
 * final score is taken as the best cell on the last row OR last column — i.e.
 * the alignment is quasi-local: the read may hang off either end of the
 * haplotype without being charged for the overhang.
 *
 * When {@code doViterbi} is set, each cell keeps the single best predecessor
 * (max) and a traceback direction; otherwise the three candidate scores are
 * combined with {@link MathUtils#softMax}, giving the full-HMM sum over
 * alignments. The traceback/alignment printout is debug-only.
 *
 * @param haplotypeBases bases of the candidate haplotype
 * @param readBases      bases of the read
 * @param readQuals      per-base Phred qualities, parallel to readBases
 * @return best (Viterbi) or summed (softMax) log10 alignment metric
 */
public double computeReadLikelihoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) {

    // DP dimensions include one extra row/column for the empty prefix.
    final int X_METRIC_LENGTH = readBases.length+1;
    final int Y_METRIC_LENGTH = haplotypeBases.length+1;

    // initialize path metric and traceback memories for likelihood computation
    double[][] pathMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
    int[][] bestMetricArray = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];

    pathMetricArray[0][0]= 0;//Double.NEGATIVE_INFINITY;

    // First column: free leading gaps in the haplotype (read overhang on the left).
    for (int i=1; i < X_METRIC_LENGTH; i++) {
        pathMetricArray[i][0] = 0;
        bestMetricArray[i][0] = UP;
    }

    // First row: free leading gaps in the read.
    for (int j=1; j < Y_METRIC_LENGTH; j++) {
        pathMetricArray[0][j] = 0;//logGapOpenProbability + (j-1) * logGapContinuationProbability;
        bestMetricArray[0][j] = LEFT;
    }

    for (int indI=1; indI < X_METRIC_LENGTH; indI++) {
        for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) {
            byte x = readBases[indI-1];
            byte y = haplotypeBases[indJ-1];
            byte qual = readQuals[indI-1];

            double bestMetric = 0.0;
            int bestMetricIdx = 0;

            // compute metric for match/mismatch
            // workaround for reads whose bases quality = 0,
            // also clamp to the top of the precomputed lookup tables
            if (qual < 1)
                qual = 1;

            if (qual > MAX_CACHED_QUAL)
                qual = MAX_CACHED_QUAL;

            // Emission term: table lookup by quality, match vs. mismatch.
            double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];

            // Three predecessors: diagonal (emit), up (gap in haplotype),
            // left (gap in read). Non-affine: every gap step pays the open penalty.
            double[] metrics = new double[3];
            metrics[DIAG] = pathMetricArray[indI-1][indJ-1] + pBaseRead;
            metrics[UP] = pathMetricArray[indI-1][indJ] + logGapOpenProbability;//(end?0.0:logGapOpenProbability);
            metrics[LEFT] = pathMetricArray[indI][indJ-1] + logGapOpenProbability;//(end?0.0:logGapOpenProbability);

            if (doViterbi) {
                bestMetricIdx = MathUtils.maxElementIndex(metrics);
                bestMetric = metrics[bestMetricIdx];
            }
            else
                bestMetric = MathUtils.softMax(metrics);

            pathMetricArray[indI][indJ] = bestMetric;
            bestMetricArray[indI][indJ] = bestMetricIdx;
        }
    }

    // Quasi-local termination: scan the last column, then the last row, for
    // the best ending cell. NOTE(review): bestMetric starts at 0.0 and the
    // DP values are sums of log10 probabilities (<= 0), so the strict '>' in
    // the first scan only fires for cells equal to 0 is impossible — the '>='
    // in the second scan is what guarantees (bestI,bestJ) gets set; verify
    // this asymmetry is intentional.
    double bestMetric=0.0;
    int bestMetricIdx=0,bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1;

    for (int i=0; i < X_METRIC_LENGTH; i ++ ) {
        int j= Y_METRIC_LENGTH-1;
        if (pathMetricArray[i][j] > bestMetric) {
            bestMetric = pathMetricArray[i][j];
            bestI = i;
            bestJ = j;
        }
    }

    for (int j=0; j < Y_METRIC_LENGTH; j++ ) {
        int i= X_METRIC_LENGTH-1;
        if (pathMetricArray[i][j] >= bestMetric) {
            bestMetric = pathMetricArray[i][j];
            bestI = i;
            bestJ = j;
        }
    }

    // Debug-only traceback: reconstruct and print the alignment by walking
    // the stored per-cell directions back from (bestI, bestJ). Gaps are
    // rendered by splicing '-' into string copies of the inputs.
    if (DEBUG && doViterbi) {
        String haplotypeString = new String (haplotypeBases);
        String readString = new String(readBases);

        int i = bestI;
        int j = bestJ;

        System.out.println("Simple NW");
        while (i >0 || j >0) {
            bestMetricIdx = bestMetricArray[i][j];
            System.out.print(bestMetricIdx);
            if (bestMetricIdx == UP) {
                // insert gap in Y
                haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j);
                i--;
            } else if (bestMetricIdx == LEFT) {
                readString = readString.substring(0,i)+"-"+readString.substring(i);
                j--;
            }
            else {
                i--; j--;
            }
        }

        System.out.println("\nAlignment: ");
        System.out.println("R:"+readString);
        System.out.println("H:"+haplotypeString);
        System.out.println();
    }

    if (DEBUG)
        System.out.format("Likelihood: %5.4f\n", bestMetric);

    return bestMetric;
}
/**
 * Scores a read against one candidate haplotype with a three-state
 * (match / X-gap / Y-gap) affine-gap Pair HMM in log10 space.
 *
 * Three DP tables are kept: M (match/mismatch emission), X (read bases
 * aligned to a gap in the haplotype) and Y (haplotype bases aligned to a gap
 * in the read). Entering a gap state pays {@code logGapOpenProbability};
 * staying in one pays {@code logGapContinuationProbability}. Gaps in the last
 * row/column are free, so read overhang past either end of the haplotype is
 * not penalized (quasi-local alignment).
 *
 * With {@code doViterbi} the recursion takes the max over predecessors and
 * records a packed traceback action per cell (lower 2 bits = next table,
 * upper bits = direction); otherwise the three terms are combined with
 * {@link MathUtils#softMax}, summing over all alignments. The traceback and
 * alignment printout are debug-only.
 *
 * @param haplotypeBases bases of the candidate haplotype
 * @param readBases      bases of the read
 * @param readQuals      per-base Phred qualities, parallel to readBases
 * @return best (Viterbi) or summed (softMax) log10 alignment metric over the
 *         three end states at the bottom-right cell
 */
public double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) {

    // DP dimensions include one extra row/column for the empty prefix.
    final int X_METRIC_LENGTH = readBases.length+1;
    final int Y_METRIC_LENGTH = haplotypeBases.length+1;

    // initialize path metric and traceback memories for likelihood computation
    double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
    double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
    double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
    int[][] bestActionArrayM = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
    int[][] bestActionArrayX = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];
    int[][] bestActionArrayY = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH];

    matchMetricArray[0][0]= 0;//Double.NEGATIVE_INFINITY;

    for (int i=1; i < X_METRIC_LENGTH; i++) {
        //initialize first column: only the X (gap-in-haplotype) state is
        // reachable, and leading overhang is free (commented-out penalty).
        matchMetricArray[i][0] = Double.NEGATIVE_INFINITY;
        YMetricArray[i][0] = Double.NEGATIVE_INFINITY;
        XMetricArray[i][0] = 0;//logGapOpenProbability + (i-1)*logGapContinuationProbability;
        bestActionArrayX[i][0] = bestActionArrayY[i][0] = bestActionArrayM[i][0] = UP_GOTO_X;
    }

    for (int j=1; j < Y_METRIC_LENGTH; j++) {
        // initialize first row: mirror image — only the Y state is reachable.
        matchMetricArray[0][j] = Double.NEGATIVE_INFINITY;
        XMetricArray[0][j] = Double.NEGATIVE_INFINITY;
        YMetricArray[0][j] = 0;//logGapOpenProbability + (j-1) * logGapContinuationProbability;
        bestActionArrayY[0][j] = bestActionArrayM[0][j] = bestActionArrayX[0][j] = LEFT_GOTO_Y;
    }

    for (int indI=1; indI < X_METRIC_LENGTH; indI++) {
        for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) {
            byte x = readBases[indI-1];
            byte y = haplotypeBases[indJ-1];
            byte qual = readQuals[indI-1];

            double bestMetric = 0.0;
            int bestMetricIdx = 0;

            // compute metric for match/mismatch
            // workaround for reads whose bases quality = 0,
            // also clamp to the top of the precomputed lookup tables
            if (qual < 1)
                qual = 1;

            if (qual > MAX_CACHED_QUAL)
                qual = MAX_CACHED_QUAL;

            // Emission term: table lookup by quality, match vs. mismatch.
            double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];

            double c,d;

            double[] metrics = new double[3];

            // update match array: M(i,j) can be entered from M, X or Y on the
            // diagonal, always paying the emission term.
            metrics[MATCH_OFFSET] = matchMetricArray[indI-1][indJ-1] + pBaseRead;
            metrics[X_OFFSET] = XMetricArray[indI-1][indJ-1] + pBaseRead;
            metrics[Y_OFFSET] = YMetricArray[indI-1][indJ-1] + pBaseRead;

            if (doViterbi) {
                bestMetricIdx = MathUtils.maxElementIndex(metrics);
                bestMetric = metrics[bestMetricIdx];
            }
            else
                bestMetric = MathUtils.softMax(metrics);

            matchMetricArray[indI][indJ] = bestMetric;
            bestActionArrayM[indI][indJ] = ACTIONS_M[bestMetricIdx];

            // update X array
            // State X(i,j): X(1:i) aligned to a gap in Y(1:j).
            // When in last column of X, ie X(1:i) aligned to full Y, we don't want to penalize gaps
            c = (indJ==Y_METRIC_LENGTH-1? 0: logGapOpenProbability);
            d = (indJ==Y_METRIC_LENGTH-1? 0: logGapContinuationProbability);
            // X can be entered from M (gap open) or extended from X (gap
            // continuation); Y -> X transitions are forbidden.
            metrics[MATCH_OFFSET] = matchMetricArray[indI-1][indJ] + c;
            metrics[X_OFFSET] = XMetricArray[indI-1][indJ] + d;
            metrics[Y_OFFSET] = Double.NEGATIVE_INFINITY; //YMetricArray[indI-1][indJ] + logGapOpenProbability;

            if (doViterbi) {
                bestMetricIdx = MathUtils.maxElementIndex(metrics);
                bestMetric = metrics[bestMetricIdx];
            }
            else
                bestMetric = MathUtils.softMax(metrics);

            XMetricArray[indI][indJ] = bestMetric;
            bestActionArrayX[indI][indJ] = ACTIONS_X[bestMetricIdx];

            // update Y array: symmetric to X, with free gaps on the last row
            // (read fully consumed), and X -> Y transitions forbidden.
            c = (indI==X_METRIC_LENGTH-1? 0: logGapOpenProbability);
            d = (indI==X_METRIC_LENGTH-1? 0: logGapContinuationProbability);
            metrics[MATCH_OFFSET] = matchMetricArray[indI][indJ-1] + c;
            metrics[X_OFFSET] = Double.NEGATIVE_INFINITY; //XMetricArray[indI][indJ-1] + logGapOpenProbability;
            metrics[Y_OFFSET] = YMetricArray[indI][indJ-1] + d;

            if (doViterbi) {
                bestMetricIdx = MathUtils.maxElementIndex(metrics);
                bestMetric = metrics[bestMetricIdx];
            }
            else
                bestMetric = MathUtils.softMax(metrics);

            YMetricArray[indI][indJ] = bestMetric;
            bestActionArrayY[indI][indJ] = ACTIONS_Y[bestMetricIdx];
        }
    }

    // Termination: combine (max or softMax) the three states at the
    // bottom-right corner of the tables.
    double bestMetric;
    double metrics[] = new double[3];
    int bestTable=0, bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1;

    metrics[MATCH_OFFSET] = matchMetricArray[bestI][bestJ];
    metrics[X_OFFSET] = XMetricArray[bestI][bestJ];
    metrics[Y_OFFSET] = YMetricArray[bestI][bestJ];

    if (doViterbi) {
        bestTable = MathUtils.maxElementIndex(metrics);
        bestMetric = metrics[bestTable];
    }
    else
        bestMetric = MathUtils.softMax(metrics);

    // Do traceback (needed only for debugging!)
    if (DEBUG && doViterbi) {

        int bestAction;
        int i = bestI;
        int j = bestJ;

        System.out.println("Affine gap NW");

        String haplotypeString = new String (haplotypeBases);
        String readString = new String(readBases);

        while (i >0 || j >0) {
            // Pick the action from the table we are currently in, splicing a
            // '-' into whichever string the current state gaps.
            if (bestTable == X_OFFSET) {
                // insert gap in Y
                haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j);
                bestAction = bestActionArrayX[i][j];
            }
            else if (bestTable == Y_OFFSET) {
                readString = readString.substring(0,i)+"-"+readString.substring(i);
                bestAction = bestActionArrayY[i][j];
            }
            else {
                bestAction = bestActionArrayM[i][j];
            }
            System.out.print(bestAction);

            // bestAction contains action to take at next step
            // encoding of bestAction: upper 2 bits = direction, lower 2 bits = next table

            // bestTable and nextDirection for next step
            bestTable = bestAction & 0x3;
            int nextDirection = bestAction >> 2;
            if (nextDirection == UP) {
                i--;
            } else if (nextDirection == LEFT) {
                j--;
            } else { // if (nextDirection == DIAG)
                i--; j--;
            }

            /* if (j<0 || i < 0)
            {
                int k=1;
            }*/
        }

        System.out.println("\nAlignment: ");
        System.out.println("R:"+readString);
        System.out.println("H:"+haplotypeString);
        System.out.println();
    }

    if (DEBUG)
        System.out.format("Likelihood: %5.4f\n", bestMetric);

    return bestMetric;
}
public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC, ReferenceContext ref){
double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
int readIdx=0;
if (DEBUG) {
System.out.println("Reference bases:");
System.out.println(new String(ref.getBases()));
}
for (SAMRecord read : pileup.getReads()) {
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
if (DEBUG)
System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
read.getAlignmentStart(),
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
read.getCigarString());
// get bases of candidate haplotypes that overlap with reads
int offset = trailingBases / 2;
long readStart = read.getUnclippedStart();
long readEnd = read.getUnclippedEnd();
int numStartSoftClippedBases =0, numEndSoftClippedBases = 0;
if (!useSoftClippedBases) {
numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
}
byte[] unclippedReadBases, unclippedReadQuals;
int numStartClippedBases = numStartSoftClippedBases;
int numEndClippedBases = numEndSoftClippedBases;
unclippedReadBases = read.getReadBases();
unclippedReadQuals = read.getBaseQualities();
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) {
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
numStartClippedBases++;
else
break;
}
for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){
if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
numEndClippedBases++;
else
break;
}
long start = Math.max(readStart + numStartClippedBases - offset - ReadUtils.getFirstInsertionOffset(read), 0);
long stop = readEnd -numEndClippedBases + offset + ReadUtils.getLastInsertionOffset(read);
// ok, we now figured out total number of clipped bases on both ends.
// Figure out where we want to place the haplotype to score read against
if (DEBUG)
System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength());
// Variables start and stop are coordinates (inclusive) where we want to get the haplotype from.
int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
// check if start of read will be before start of reference context
if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut
start = ref.getWindow().getStart();
// check also if end of read will go beyond reference context
if (stop > ref.getWindow().getStop())
stop = ref.getWindow().getStop();
// if there's an insertion in the read, the read stop position will be less than start + read length,
// but we want to compute likelihoods in the whole region that a read might overlap
if (stop <= start + readLength) {
stop = start + readLength-1;
}
/*
byte[] readAlignedBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), read.getReadBases()); // Adjust the read bases based on the Cigar string
byte[] readAlignedQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), read.getBaseQualities()); // Shift the location of the qual scores based on the Cigar string
System.out.println("Aligned bases:"+new String(readAlignedBases));
*/
if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
if (DEBUG)
System.out.println("BAD READ!!");
for (int j=0; j < haplotypesInVC.size(); j++) {
readLikelihoods[readIdx][j]= 0;
}
}
else {
byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
if (DEBUG) {
System.out.println("Read bases:");
System.out.println(new String(readBases));
}
// start and stop have indices into
for (int j=0; j < haplotypesInVC.size(); j++) {
Haplotype haplotype = haplotypesInVC.get(j);
// cut haplotype bases
//long indStart = start - haplotype.getStartPosition();
//long indStop = stop - haplotype.getStartPosition();
//long indStart = 0;
//long indStop = 400;
//byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
// (int)indStart, (int)indStop);
byte[] haplotypeBases = haplotype.getBasesAsBytes();
//BAQ.BAQCalculationResult baqRes = baq.calcBAQFromHMM(read, haplotypeBases, (int)(start - readStart));
/* if (s1 != null && s2 != null) {
haplotypeBases = s2.getBytes();
readBases = s1.getBytes();
readQuals = null;
readQuals = new byte[readBases.length];
for (int i=0; i < readQuals.length; i++)
readQuals[i] = (byte)30;
byte[] iq = new byte[readBases.length];
byte[] q = new byte[readBases.length];
int[] state = new int[readBases.length];
int a=baq.hmm_glocal(haplotypeBases, readBases, 0, readBases.length, iq, state, q);
System.out.println("BAQ State:");
for (int i=0; i < state.length; i++)
System.out.format("%d ",state[i]);
System.out.println();
System.out.println("BAQ Q:");
for (int i=0; i < q.length; i++)
System.out.format("%d ",(int)q[i]);
System.out.println();
} */
if (DEBUG) {
System.out.println("Haplotype to test:");
System.out.println(new String(haplotypeBases));
}
if (useAffineGapModel)
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals);
else
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);
}
}
readIdx++;
}
if (DEBUG) {
System.out.println("\nLikelihood summary");
for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
System.out.format("Read Index: %d ",readIdx);
for (int i=0; i < readLikelihoods[readIdx].length; i++)
System.out.format("L%d: %f ",i,readLikelihoods[readIdx][i]);
System.out.println();
}
}
for (int i=0; i < haplotypesInVC.size(); i++) {
for (int j=i; j < haplotypesInVC.size(); j++){
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
// L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
//readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
double[] readLikelihood = new double[2]; // diploid sample
for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
if (Double.isInfinite(readLikelihoods[readIdx][i]) || Double.isInfinite(readLikelihoods[readIdx][j]))
continue;
haplotypeLikehoodMatrix[i][j] += ( MathUtils.softMax(readLikelihoods[readIdx][i],
readLikelihoods[readIdx][j]) + LOG_ONE_HALF);
}
}
}
return haplotypeLikehoodMatrix;
}
/**
 * Flattens the upper-triangular haplotype-pair likelihood matrix into a linear
 * genotype likelihood vector (diploid ordering: (0,0),(0,1),...,(0,n-1),(1,1),...),
 * then shifts all entries so the most likely genotype has likelihood 0.
 *
 * @param haplotypeLikehoodMatrix square matrix; only entries with j >= i are read
 * @return vector of length n*(n+1)/2 with max entry normalized to 0
 */
public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
    final int numHaplotypes = haplotypeLikehoodMatrix.length;
    final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes + 1) / 2];

    // Copy the upper triangle in row-major order, tracking the best entry as we go.
    double best = Double.NEGATIVE_INFINITY;
    int idx = 0;
    for (int row = 0; row < numHaplotypes; row++) {
        for (int col = row; col < numHaplotypes; col++) {
            final double lik = haplotypeLikehoodMatrix[row][col];
            genotypeLikelihoods[idx++] = lik;
            best = Math.max(best, lik);
        }
    }

    // Renormalize: subtract the maximum so the best genotype sits at 0.
    for (int k = 0; k < genotypeLikelihoods.length; k++)
        genotypeLikelihoods[k] -= best;

    return genotypeLikelihoods;
}
/**
 * Retrieves the bases and qualities from a read that align between startPos and endPos
 * (inclusive) on the read's contig, walking the CIGAR to map reference coordinates
 * onto read offsets.
 *
 * The requested window is first clamped to the read's unclipped span; soft-clipped
 * bases are treated like aligned ones (S falls through to the M case below).
 *
 * NOTE(review): read coordinates are derived as unclippedStart + readPos, which assumes
 * the alignPos bookkeeping accounts for indels before the window — confirm against
 * callers before reusing elsewhere.
 *
 * @param read     read to extract bases from
 * @param startPos first reference position of interest
 * @param endPos   last reference position of interest
 * @return a Pair of (bases, quals) byte arrays trimmed to the number of extracted bases
 */
public static Pair<byte[],byte[]> getBasesBetweenPositions(SAMRecord read, long startPos, long endPos ) {
    final Cigar cigar = read.getCigar();
    final byte[] bases = read.getReadBases();
    final byte[] quals = read.getBaseQualities();

    // Clamp the requested window to the read's unclipped extent.
    long alignStartPos = startPos;
    long readStartPos = read.getUnclippedStart();
    if (alignStartPos < readStartPos )
        alignStartPos = readStartPos;

    long alignEndPos = endPos;
    long readEndPos = read.getUnclippedEnd();
    if (alignEndPos > readEndPos )
        alignEndPos = readEndPos;

    // Worst-case sized output buffers; trimmed to outPos on return.
    final byte[] readAlignment = new byte[bases.length];
    final byte[] qualAlignment = new byte[bases.length];
    int outPos = 0;   // next write index into the output arrays
    int readPos = 0;  // bases consumed from the read
    int alignPos = 0; // reference positions consumed

    for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
        final CigarElement ce = cigar.getCigarElement(iii);
        final int elementLength = ce.getLength();

        switch( ce.getOperator() ) {
            case I:
                // Insertions consume read bases but no reference; copy them once
                // we're inside the window.
                for ( int jjj = 0; jjj < elementLength; jjj++ ) {
                    if (readStartPos +readPos >= alignStartPos) {
                        readAlignment[outPos] = bases[readPos];
                        qualAlignment[outPos] = quals[readPos];
                        outPos++;
                    }
                    readPos++;
                }
                break;
            case D:
            case N:
                // Deletions/skips consume reference positions only.
                for ( int jjj = 0; jjj < elementLength; jjj++ ) {
                    alignPos++;
                    if (alignStartPos + alignPos > alignEndPos)
                        break;
                }
                break;
            case S:
            case M:
                // Aligned (and soft-clipped) bases consume both read and reference.
                for ( int jjj = 0; jjj < elementLength; jjj++ ) {
                    if (readStartPos +readPos >= alignStartPos) {
                        readAlignment[outPos] = bases[readPos];
                        qualAlignment[outPos] = quals[readPos];
                        outPos++;
                        alignPos++;
                    }
                    readPos++;
                    if (alignStartPos + alignPos > alignEndPos)
                        break;
                }
                break;
            case H:
            case P:
                // Hard clips and padding contribute no bases to the output.
                break;
            default:
                throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
        }
        // If we're already beyond the edge of the window of interest, nothing more to do.
        if (alignStartPos + alignPos > alignEndPos)
            break;
    }
    // Fix: parameterized Pair construction instead of the raw type (removes unchecked warning).
    return new Pair<byte[],byte[]>(Arrays.copyOf(readAlignment,outPos), Arrays.copyOf(qualAlignment,outPos));
}
}

View File

@ -873,4 +873,85 @@ public class MathUtils {
/** Ratio num/denom as a double; a denominator <= 0 is replaced by 1 to avoid division by zero. */
public static double ratio(int num, int denom) {
    return (double) num / Math.max(denom, 1);
}
/** Ratio num/denom as a double; a denominator <= 0 is replaced by 1 to avoid division by zero. */
public static double ratio(long num, long denom) {
    return (double) num / Math.max(denom, 1);
}
// log10Cache[k] == Math.log10(k) for k in [1, 2*MAXN); log10Cache[0] is -Infinity (log10 of zero).
public static final double[] log10Cache;
// jacobianLogTable[k] == log10(1 + 10^(-k * JACOBIAN_LOG_TABLE_STEP)); lookup table used by
// softMax(x, y) to approximate log10(10^x + 10^y) without pow/log calls.
public static final double[] jacobianLogTable;
// The table samples differences 0.0, 0.1, ..., 10.0 (101 entries at step 0.1).
public static final int JACOBIAN_LOG_TABLE_SIZE = 101;
public static final double JACOBIAN_LOG_TABLE_STEP = 0.1;
// Beyond this |x - y| the Jacobian correction is negligible and softMax returns max(x, y).
public static final double MAX_JACOBIAN_TOLERANCE = 10.0;
// Half the size of log10Cache; cache covers log10(0) .. log10(2*MAXN - 1).
private static final int MAXN = 10000;
static {
// Pre-fill both tables once at class-load time.
log10Cache = new double[2*MAXN];
jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE];
log10Cache[0] = Double.NEGATIVE_INFINITY; // log10(0)
for (int k=1; k < 2*MAXN; k++)
log10Cache[k] = Math.log10(k);
for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k)
* JACOBIAN_LOG_TABLE_STEP));
}
}
/**
 * Computes log10( sum_k 10^vec[k] ) by folding the pairwise Jacobian-log
 * approximation over the array.
 *
 * Fix: the original indexed vec[0] unconditionally and threw
 * ArrayIndexOutOfBoundsException on an empty array. An empty sum corresponds
 * to probability 0, i.e. log10(0) = -Infinity.
 *
 * @param vec log10-space values to be "summed"
 * @return approximate log10 of the sum of the linear-space values
 */
static public double softMax(double[] vec) {
    if (vec == null || vec.length == 0)
        return Double.NEGATIVE_INFINITY; // empty sum -> probability 0
    double acc = vec[0];
    for (int k = 1; k < vec.length; k++)
        acc = softMax(acc, vec[k]);
    return acc;
}
/** Returns the largest of the three arguments. */
static public double max(double x0, double x1, double x2) {
    return Math.max(Math.max(x0, x1), x2);
}
/**
 * Approximates log10(10^x0 + 10^x1 + 10^x2) by applying the pairwise
 * Jacobian-logarithm softMax twice (more accurate than the naive
 * pow/sum/log10 formulation, and much cheaper).
 */
static public double softMax(double x0, double x1, double x2) {
    return softMax(softMax(x0, x1), x2);
}
/**
 * Approximates log10(10^x + 10^y) using the Jacobian logarithm identity:
 *   log10(10^x + 10^y) = max(x,y) + log10(1 + 10^-|x-y|)
 * where the correction term comes from a pre-computed table quantized at
 * JACOBIAN_LOG_TABLE_STEP.
 *
 * Fix: the original tested Double.isInfinite(), which also matches
 * POSITIVE_INFINITY and would wrongly return the *other* operand for an
 * infinite sum. Only NEGATIVE_INFINITY (log10 of zero probability) is the
 * identity element of this operation.
 *
 * @param x log10-space operand
 * @param y log10-space operand
 * @return approximate log10 of the sum of the linear-space values
 */
static public double softMax(double x, double y) {
    // log10(0) contributes nothing to the sum.
    if (x == Double.NEGATIVE_INFINITY)
        return y;
    if (y == Double.NEGATIVE_INFINITY)
        return x;
    // When the operands are far apart the correction term is below the
    // table's resolution, so the larger operand alone is the answer.
    if (y >= x + MAX_JACOBIAN_TOLERANCE)
        return y;
    if (x >= y + MAX_JACOBIAN_TOLERANCE)
        return x;
    // Here |x - y| < MAX_JACOBIAN_TOLERANCE, so the quantized index is in
    // range: corrections are pre-stored for differences 0.0, 0.1, ..., 10.0.
    final double diff = Math.abs(x - y);
    final int ind = (int) Math.round(diff / JACOBIAN_LOG_TABLE_STEP);
    return Math.max(x, y) + jacobianLogTable[ind];
}
}

View File

@ -6,9 +6,11 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.ReadUtils;
/*
The topology of the profile HMM:
@ -261,7 +263,26 @@ public class BAQ {
s[l_query+1] = sum; // the last scaling factor
}
//gdebug+
/*
double cac=0.;
// undo scaling of forward probabilities to obtain plain probability of observation given model
double[] su = new double[f[l_query].length];
{
double sum = 0.;
double[] logs = new double[s.length];
for (k=0; k < logs.length; k++) {
logs[k] = Math.log10(s[k]);
sum += logs[k];
}
for (k=0; k < f[l_query].length; k++)
su[k]= Math.log10(f[l_query][k])+ sum;
cac = MathUtils.softMax(su);
}
System.out.format("s:%f\n",cac);
// gdebug-
*/
/*** backward ***/
// b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
for (k = 1; k <= l_ref; ++k) {
@ -321,7 +342,7 @@ public class BAQ {
max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
if (state != null) state[qstart+i-1] = max_k;
if (q != null) {
k = (int)(-4.343 * Math.log(1. - max) + .499);
k = (int)(-4.343 * Math.log(1. - max) + .499); // = 10*log10(1-max)
q[qstart+i-1] = (byte)(k > 100? 99 : (k < minBaseQual ? minBaseQual : k));
}
//System.out.println("("+pb+","+sum+")"+" ("+(i-1)+","+(max_k>>2)+","+(max_k&3)+","+max+")");
@ -457,28 +478,12 @@ public class BAQ {
}
}
/** Length of a leading insertion if the CIGAR starts with an I element, otherwise 0. */
private static int getFirstInsertionOffset(SAMRecord read) {
    final CigarElement first = read.getCigar().getCigarElement(0);
    return first.getOperator() == CigarOperator.I ? first.getLength() : 0;
}
/** Length of a trailing insertion if the CIGAR ends with an I element, otherwise 0. */
private static int getLastInsertionOffset(SAMRecord read) {
    final CigarElement last = read.getCigar().getCigarElement(read.getCigarLength() - 1);
    return last.getOperator() == CigarOperator.I ? last.getLength() : 0;
}
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) {
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) {
// start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar
int offset = getBandWidth() / 2;
long readStart = includeClippedBases ? read.getUnclippedStart() : read.getAlignmentStart();
long start = Math.max(readStart - offset - getFirstInsertionOffset(read), 0);
long stop = (includeClippedBases ? read.getUnclippedEnd() : read.getAlignmentEnd()) + offset + getLastInsertionOffset(read);
long start = Math.max(readStart - offset - ReadUtils.getFirstInsertionOffset(read), 0);
long stop = (includeClippedBases ? read.getUnclippedEnd() : read.getAlignmentEnd()) + offset + ReadUtils.getLastInsertionOffset(read);
if ( stop > refReader.getSequenceDictionary().getSequence(read.getReferenceName()).getSequenceLength() ) {
return null;

View File

@ -40,8 +40,7 @@ public class Haplotype {
protected double[] quals = null;
private GenomeLoc genomeLocation = null;
private boolean isReference = false;
public static final int LEFT_WINDOW_SIZE = 20;
/**
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
*
@ -100,11 +99,17 @@ public class Haplotype {
return genomeLocation.getStart();
}
/** Stop coordinate of this haplotype's genomic location (delegates to genomeLocation). */
public long getStopPosition() {
return genomeLocation.getStop();
}
/** Whether this haplotype was flagged as the reference haplotype at construction. */
public boolean isReference() {
return isReference;
}
public static List<Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref, final int haplotypeSize) {
public static List<Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref,
final int haplotypeSize, final int numPrefBases) {
List<Haplotype> haplotypeList = new ArrayList<Haplotype>();
@ -124,12 +129,11 @@ public class Haplotype {
byte[] refBases = ref.getBases();
int numPrefBases = LEFT_WINDOW_SIZE;
int startIdxInReference = (int)(1+startPos-numPrefBases-ref.getWindow().getStart());
//int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event
byte[] basesBeforeVariant = Arrays.copyOfRange(refBases,startIdxInReference,startIdxInReference+numPrefBases);
byte[] basesAfterVariant = Arrays.copyOfRange(refBases,
startIdxInReference+numPrefBases+ refAllele.getBases().length, refBases.length);

View File

@ -228,4 +228,24 @@ public class ReadUtils {
return Collections.emptyList();
}
}
/** Length of a leading insertion if the read's CIGAR starts with an I element, otherwise 0. */
public final static int getFirstInsertionOffset(SAMRecord read) {
    final CigarElement first = read.getCigar().getCigarElement(0);
    return first.getOperator() == CigarOperator.I ? first.getLength() : 0;
}
/** Length of a trailing insertion if the read's CIGAR ends with an I element, otherwise 0. */
public final static int getLastInsertionOffset(SAMRecord read) {
    final CigarElement last = read.getCigar().getCigarElement(read.getCigarLength() - 1);
    return last.getOperator() == CigarOperator.I ? last.getLength() : 0;
}
}