diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java index 728a6b6df..1d1fb3e6d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java @@ -30,7 +30,10 @@ import org.apache.log4j.Logger; import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; +import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.genotype.Haplotype; @@ -46,25 +49,39 @@ import java.util.*; public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final int maxReadDeletionLength = 3; - private final int HAPLOTYPE_SIZE = 80; + private final int HAPLOTYPE_SIZE; private final int minIndelCountForGenotyping; private final boolean getAlleleListFromVCF; // todo - the following need to be exposed for command line argument control private final double indelHeterozygosity = 1.0/8000; boolean useFlatPriors = true; - private static final boolean DEBUGOUT = false; - private static final boolean DEBUG = false; + + private boolean DEBUG = false; + + // todo -cleanup private HaplotypeIndelErrorModel model; + private PairHMMIndelErrorModel pairModel; + private boolean newLike = false; + private GenomeLoc lastSiteVisited; private List alleleList; + protected DindelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); - model = new HaplotypeIndelErrorModel(maxReadDeletionLength, UAC.INSERTION_START_PROBABILITY, - UAC.INSERTION_END_PROBABILITY, UAC.ALPHA_DELETION_PROBABILITY, HAPLOTYPE_SIZE, false, DEBUGOUT); + if (UAC.newlike) { + pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY, + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.S1, UAC.S2, UAC.dovit); + newLike = true; + } + else + model = new HaplotypeIndelErrorModel(maxReadDeletionLength, UAC.INSERTION_START_PROBABILITY, + UAC.INSERTION_END_PROBABILITY,UAC.ALPHA_DELETION_PROBABILITY,UAC.INDEL_HAPLOTYPE_SIZE, false, UAC.OUTPUT_DEBUG_INDEL_INFO); alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; + HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; + DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; } @@ -300,11 +317,15 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo if (eventLength<0) eventLength = - eventLength; - int currentHaplotypeSize = HAPLOTYPE_SIZE; - // int numSamples = getNSamples(contexts); - List haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(), - ref, currentHaplotypeSize); + List haplotypesInVC; + if (newLike) + haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(), + ref, HAPLOTYPE_SIZE, HAPLOTYPE_SIZE/2); + else + haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(), + ref, HAPLOTYPE_SIZE, 20); + // For each sample, get genotype likelihoods based on pileup // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. // initialize the GenotypeLikelihoods @@ -316,7 +337,7 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo priors = new DiploidIndelGenotypePriors(); } else - priors = new DiploidIndelGenotypePriors(indelHeterozygosity,eventLength,currentHaplotypeSize); + priors = new DiploidIndelGenotypePriors(indelHeterozygosity,eventLength,HAPLOTYPE_SIZE); //double[] priorLikelihoods = priors.getPriors(); @@ -330,7 +351,11 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo pileup = context.getBasePileup(); if (pileup != null ) { - haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC); + + if (newLike) + haplotypeLikehoodMatrix = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, ref); + else + haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC); double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix); @@ -342,7 +367,8 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo genotypeLikelihoods[1], genotypeLikelihoods[2], getFilteredDepth(pileup))); - //System.out.format("%4.2f %4.2f %4.2f\n",genotypeLikelihoods[0],genotypeLikelihoods[1], genotypeLikelihoods[2]); + if (DEBUG) + System.out.format("Sample:%s GL:%4.2f %4.2f %4.2f\n",sample.getKey(), genotypeLikelihoods[0],genotypeLikelihoods[1], genotypeLikelihoods[2]); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index ef6cbf82f..f57b4f40d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -57,29 +57,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private boolean SIMPLE_GREEDY_GENOTYPER = false; - private static final double[] log10Cache; - private static final double[] jacobianLogTable; -// private static final int JACOBIAN_LOG_TABLE_SIZE = 100001; -// private static final double JACOBIAN_LOG_TABLE_STEP = 0.0001; - private static final int JACOBIAN_LOG_TABLE_SIZE = 101; - private static final double JACOBIAN_LOG_TABLE_STEP = 0.1; - private static final double MAX_JACOBIAN_TOLERANCE = 10.0; - private static final int MAXN = 10000; // todo -- warning, this might be hit at some point... - static { - log10Cache = new double[2*MAXN]; - jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; - - log10Cache[0] = Double.NEGATIVE_INFINITY; - for (int k=1; k < 2*MAXN; k++) - log10Cache[k] = Math.log10(k); - - for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { - jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k) * JACOBIAN_LOG_TABLE_STEP)); - - } - - } + final private ExactCalculation calcToUse; protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { @@ -314,19 +293,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { for ( int j=1; j <= numSamples; j++ ) { final double[] gl = genotypeLikelihoods[j]; - final double logDenominator = log10Cache[2*j] + log10Cache[2*j-1]; + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; double aa = Double.NEGATIVE_INFINITY; double ab = Double.NEGATIVE_INFINITY; if (k < 2*j-1) - aa = log10Cache[2*j-k] + log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()]; + aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()]; if (k < 2*j) - ab = log10Cache[2*k] + log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()]; + ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()]; double log10Max; if (k > 1) { - final double bb = log10Cache[k] + log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()]; + final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()]; log10Max = approximateLog10SumLog10(aa, ab, bb); } else { // we know we aren't considering the BB case, so we can use an optimized log10 function @@ -372,7 +351,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY ) return big; - if (big >= small + MAX_JACOBIAN_TOLERANCE) + if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE) return big; // OK, so |y-x| < tol: we use the following identity then: @@ -383,14 +362,15 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // with integer quantization // we have pre-stored correction for 0,0.1,0.2,... 10.0 //final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding - int ind = (int)(Math.round((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding + int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding //double z =Math.log10(1+Math.pow(10.0,-diff)); //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); - return big + jacobianLogTable[ind]; + return big + MathUtils.jacobianLogTable[ind]; } + /** * Can be overridden by concrete subclasses * @param vc variant context with genotype likelihoods @@ -549,7 +529,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector(); //double logDenominator = Math.log10(2.0*j*(2.0*j-1)); - double logDenominator = log10Cache[2*j] + log10Cache[2*j-1]; + double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; // special treatment for k=0: iteration reduces to: //YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()]; @@ -561,25 +541,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { double logNumerator[]; logNumerator = new double[3]; if (k < 2*j-1) - logNumerator[0] = log10Cache[2*j-k] + log10Cache[2*j-k-1] + logYMatrix[j-1][k] + + logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] + genotypeLikelihoods[GenotypeType.AA.ordinal()]; else logNumerator[0] = Double.NEGATIVE_INFINITY; if (k < 2*j) - logNumerator[1] = log10Cache[2*k] + log10Cache[2*j-k]+ logYMatrix[j-1][k-1] + + logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] + genotypeLikelihoods[GenotypeType.AB.ordinal()]; else logNumerator[1] = Double.NEGATIVE_INFINITY; if (k > 1) - logNumerator[2] = log10Cache[k] + log10Cache[k-1] + logYMatrix[j-1][k-2] + + logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] + genotypeLikelihoods[GenotypeType.BB.ordinal()]; else logNumerator[2] = Double.NEGATIVE_INFINITY; - double logNum = softMax(logNumerator); + double logNum = MathUtils.softMax(logNumerator); //YMatrix[j][k] = num/den; logYMatrix[j][k] = logNum - logDenominator; @@ -602,47 +582,4 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } - double softMax(double[] vec) { - // compute naively log10(10^x[0] + 10^x[1]+...) - // return Math.log10(MathUtils.sumLog10(vec)); - - // better approximation: do Jacobian logarithm function on data pairs - double a = softMaxPair(vec[0],vec[1]); - return softMaxPair(a,vec[2]); - } - - static public double softMaxPair(double x, double y) { - if (Double.isInfinite(x)) - return y; - - if (Double.isInfinite(y)) - return x; - - if (y >= x + MAX_JACOBIAN_TOLERANCE) - return y; - if (x >= y + MAX_JACOBIAN_TOLERANCE) - return x; - - // OK, so |y-x| < tol: we use the following identity then: - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup - // with integer quantization - double diff = Math.abs(x-y); - double t1 =x; - if (y > x) - t1 = y; - // t has max(x,y) - // we have pre-stored correction for 0,0.1,0.2,... 10.0 - int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP); - double t2 = jacobianLogTable[ind]; - - // gdebug+ - //double z =Math.log10(1+Math.pow(10.0,-diff)); - //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); - //gdebug- - return t1+t2; - // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); - } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 9e6cc24e6..488c24fc0 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -95,6 +95,7 @@ public class UnifiedArgumentCollection { @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) public double INDEL_HETEROZYGOSITY = 1.0/8000; + // todo - remove these 3 arguments @Argument(fullName = "insertionStartProbability", shortName = "insertionStartProbability", doc = "Heterozygosity for indel calling", required = false) public double INSERTION_START_PROBABILITY = 1e-3; @@ -104,6 +105,33 @@ public class UnifiedArgumentCollection { @Argument(fullName = "alphaDeletionProbability", shortName = "alphaDeletionProbability", doc = "Heterozygosity for indel calling", required = false) public double ALPHA_DELETION_PROBABILITY = 1e-3; + + @Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty", required = false) + public double INDEL_GAP_CONTINUATION_PENALTY = 10.0; + + @Argument(fullName = "indelGapOpenPenalty", shortName = "indelGOP", doc = "Indel gap open penalty", required = false) + public double INDEL_GAP_OPEN_PENALTY = 40.0; + + @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) + public int INDEL_HAPLOTYPE_SIZE = 80; + //gdebug+ + @Hidden + @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) + public boolean OUTPUT_DEBUG_INDEL_INFO = false; + @Hidden + @Argument(fullName = "s1", shortName = "s1", doc = "Output indel debug info", required = false) + public String S1 = null; + @Hidden + @Argument(fullName = "s2", shortName = "s2", doc = "Output indel debug info", required = false) + public String S2 = null; + @Hidden + @Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false) + public boolean dovit = false; + @Hidden + @Argument(fullName = "newlike", shortName = "newlike", doc = "Output indel debug info", required = false) + public boolean newlike = false; + //gdebug- + @Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false) public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL; @@ -135,13 +163,20 @@ public class UnifiedArgumentCollection { uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; + uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; + uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; + uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO; + uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; + + // todo- arguments to remove + uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT; uac.INSERTION_START_PROBABILITY = INSERTION_START_PROBABILITY; uac.INSERTION_END_PROBABILITY = INSERTION_END_PROBABILITY; uac.ALPHA_DELETION_PROBABILITY = ALPHA_DELETION_PROBABILITY; - uac.EXACT_CALCULATION_TYPE = EXACT_CALCULATION_TYPE; - - uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT; - + uac.S1 = S1; + uac.S2 = S2; + uac.dovit = dovit; + return uac; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index 4ef8214cc..34f8ffe85 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -453,7 +453,7 @@ public class HaplotypeIndelErrorModel { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+x0^x2)-log10(2) // First term is approximated by Jacobian log with table lookup. // Second term is a constant added to both likelihoods so will be ignored - haplotypeLikehoodMatrix[i][j] += ExactAFCalculationModel.softMaxPair(readLikelihood[0], + haplotypeLikehoodMatrix[i][j] += MathUtils.softMax(readLikelihood[0], readLikelihood[1]); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java new file mode 100755 index 000000000..6db196150 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -0,0 +1,773 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.genotype.Haplotype; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.Arrays; +import java.util.List; + + +public class PairHMMIndelErrorModel { + + + private final int BASE_QUAL_THRESHOLD = 4; + + + private static final int MATCH_OFFSET = 0; + private static final int X_OFFSET = 1; + private static final int Y_OFFSET = 2; + + private static final int DIAG = 0; + private static final int UP = 1; + private static final int LEFT = 2; + + private static final int DIAG_GOTO_M = 0; + private static final int DIAG_GOTO_X = 1; + private static final int DIAG_GOTO_Y = 2; + + private static final int UP_GOTO_M = 4; + private static final int UP_GOTO_X = 5; + private static final int UP_GOTO_Y = 6; + + private static final int LEFT_GOTO_M = 8; + private static final int LEFT_GOTO_X = 9; + private static final int LEFT_GOTO_Y = 10; + + private static final int[] ACTIONS_M = {DIAG_GOTO_M, DIAG_GOTO_X, DIAG_GOTO_Y}; + private static final int[] ACTIONS_X = {UP_GOTO_M, UP_GOTO_X, UP_GOTO_Y}; + private static final int[] ACTIONS_Y = {LEFT_GOTO_M, LEFT_GOTO_X, LEFT_GOTO_Y}; + + + private final double logGapOpenProbability; + private final double logGapContinuationProbability; + + private boolean DEBUG = false; + + private static final int MAX_CACHED_QUAL = 127; + + private static final double baseMatchArray[]; + private static final double baseMismatchArray[]; + + private final static double LOG_ONE_THIRD; + private final static double LOG_ONE_HALF; + /* + private final double c; + private final double d; + private final double e; + private final double oneMinusTwoEps; + */ + private final int trailingBases = 3; + + private boolean doViterbi = false; + private final boolean useSoftClippedBases = false; + private final boolean useAffineGapModel = true; + + private String s1; + private String s2; + + // private BAQ baq; + + static { + LOG_ONE_THIRD= -Math.log10(3.0); + LOG_ONE_HALF= -Math.log10(2.0); + final double OFFSET = -Math.log10(0.25); + + baseMatchArray = new double[MAX_CACHED_QUAL+1]; + baseMismatchArray = new double[MAX_CACHED_QUAL+1]; + for (int k=1; k <= MAX_CACHED_QUAL; k++) { + double baseProb = Math.pow(10, -k/10.); + + + baseMatchArray[k] = Math.log10(1-baseProb)+ OFFSET; + baseMismatchArray[k] = Math.log10(baseProb)+LOG_ONE_THIRD+OFFSET; + } + } + + public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, String s1, String s2, boolean dovit) { + this(indelGOP, indelGCP, deb); + this.s1 = s1; + this.s2 = s2; + this.doViterbi = dovit; + } + + public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb) { + + + this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob + this.logGapContinuationProbability = -indelGCP/10.0; // QUAL to log prob + + this.DEBUG = deb; + + +// baq = new BAQ(Math.pow(10.0,logGapOpenProbability),Math.pow(10.0,logGapContinuationProbability), + // 3,(byte)BASE_QUAL_THRESHOLD,true); + + } + + public double computeReadLikelihoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) { + final int X_METRIC_LENGTH = readBases.length+1; + final int Y_METRIC_LENGTH = haplotypeBases.length+1; + + // initialize path metric and traceback memories for likelihood computation + double[][] pathMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + int[][] bestMetricArray = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + pathMetricArray[0][0]= 0;//Double.NEGATIVE_INFINITY; + + for (int i=1; i < X_METRIC_LENGTH; i++) { + pathMetricArray[i][0] = 0; + bestMetricArray[i][0] = UP; + } + + for (int j=1; j < Y_METRIC_LENGTH; j++) { + pathMetricArray[0][j] = 0;//logGapOpenProbability + (j-1) * logGapContinuationProbability; + bestMetricArray[0][j] = LEFT; + } + + for (int indI=1; indI < X_METRIC_LENGTH; indI++) { + for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) { + + byte x = readBases[indI-1]; + byte y = haplotypeBases[indJ-1]; + byte qual = readQuals[indI-1]; + + double bestMetric = 0.0; + int bestMetricIdx = 0; + + // compute metric for match/mismatch + // workaround for reads whose bases quality = 0, + if (qual < 1) + qual = 1; + + if (qual > MAX_CACHED_QUAL) + qual = MAX_CACHED_QUAL; + + double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; + double[] metrics = new double[3]; + + metrics[DIAG] = pathMetricArray[indI-1][indJ-1] + pBaseRead; + metrics[UP] = pathMetricArray[indI-1][indJ] + logGapOpenProbability;//(end?0.0:logGapOpenProbability); + metrics[LEFT] = pathMetricArray[indI][indJ-1] + logGapOpenProbability;//(end?0.0:logGapOpenProbability); + + if (doViterbi) { + bestMetricIdx = MathUtils.maxElementIndex(metrics); + bestMetric = metrics[bestMetricIdx]; + } + else + bestMetric = MathUtils.softMax(metrics); + + pathMetricArray[indI][indJ] = bestMetric; + bestMetricArray[indI][indJ] = bestMetricIdx; + + } + } + + + double bestMetric=0.0; + int bestMetricIdx=0,bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1; + + for (int i=0; i < X_METRIC_LENGTH; i ++ ) { + int j= Y_METRIC_LENGTH-1; + + if (pathMetricArray[i][j] > bestMetric) { + bestMetric = pathMetricArray[i][j]; + bestI = i; + bestJ = j; + } + } + for (int j=0; j < Y_METRIC_LENGTH; j++ ) { + int i= X_METRIC_LENGTH-1; + if (pathMetricArray[i][j] >= bestMetric) { + bestMetric = pathMetricArray[i][j]; + bestI = i; + bestJ = j; + } + } + + if (DEBUG && doViterbi) { + + String haplotypeString = new String (haplotypeBases); + String readString = new String(readBases); + + + int i = bestI; + int j = bestJ; + + + System.out.println("Simple NW"); + + while (i >0 || j >0) { + bestMetricIdx = bestMetricArray[i][j]; + System.out.print(bestMetricIdx); + if (bestMetricIdx == UP) { + // insert gap in Y + haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j); + i--; + } else if (bestMetricIdx == LEFT) { + readString = readString.substring(0,i)+"-"+readString.substring(i); + j--; + } + else { + i--; j--; + } + } + + + + + System.out.println("\nAlignment: "); + System.out.println("R:"+readString); + System.out.println("H:"+haplotypeString); + System.out.println(); + + + } + if (DEBUG) + System.out.format("Likelihood: %5.4f\n", bestMetric); + + return bestMetric; + + + } + + public double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) { + + final int X_METRIC_LENGTH = readBases.length+1; + final int Y_METRIC_LENGTH = haplotypeBases.length+1; + + // initialize path metric and traceback memories for likelihood computation + double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + int[][] bestActionArrayM = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + int[][] bestActionArrayX = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + int[][] bestActionArrayY = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + matchMetricArray[0][0]= 0;//Double.NEGATIVE_INFINITY; + + for (int i=1; i < X_METRIC_LENGTH; i++) { + //initialize first column + matchMetricArray[i][0] = Double.NEGATIVE_INFINITY; + YMetricArray[i][0] = Double.NEGATIVE_INFINITY; + XMetricArray[i][0] = 0;//logGapOpenProbability + (i-1)*logGapContinuationProbability; + + bestActionArrayX[i][0] = bestActionArrayY[i][0] = bestActionArrayM[i][0] = UP_GOTO_X; + } + + for (int j=1; j < Y_METRIC_LENGTH; j++) { + // initialize first row + matchMetricArray[0][j] = Double.NEGATIVE_INFINITY; + XMetricArray[0][j] = Double.NEGATIVE_INFINITY; + YMetricArray[0][j] = 0;//logGapOpenProbability + (j-1) * logGapContinuationProbability; + + bestActionArrayY[0][j] = bestActionArrayM[0][j] = bestActionArrayX[0][j] = LEFT_GOTO_Y; + } + + for (int indI=1; indI < X_METRIC_LENGTH; indI++) { + for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) { + + byte x = readBases[indI-1]; + byte y = haplotypeBases[indJ-1]; + byte qual = readQuals[indI-1]; + + double bestMetric = 0.0; + int bestMetricIdx = 0; + + // compute metric for match/mismatch + // workaround for reads whose bases quality = 0, + if (qual < 1) + qual = 1; + + if (qual > MAX_CACHED_QUAL) + qual = MAX_CACHED_QUAL; + + double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; + + + double c,d; + double[] metrics = new double[3]; + + // update match array + metrics[MATCH_OFFSET] = matchMetricArray[indI-1][indJ-1] + pBaseRead; + metrics[X_OFFSET] = XMetricArray[indI-1][indJ-1] + pBaseRead; + metrics[Y_OFFSET] = YMetricArray[indI-1][indJ-1] + pBaseRead; + + if (doViterbi) { + bestMetricIdx = MathUtils.maxElementIndex(metrics); + bestMetric = metrics[bestMetricIdx]; + } + else + bestMetric = MathUtils.softMax(metrics); + + matchMetricArray[indI][indJ] = bestMetric; + bestActionArrayM[indI][indJ] = ACTIONS_M[bestMetricIdx]; + + // update X array + // State X(i,j): X(1:i) aligned to a gap in Y(1:j). + // When in last column of X, ie X(1:i) aligned to full Y, we don't want to penalize gaps + c = (indJ==Y_METRIC_LENGTH-1? 0: logGapOpenProbability); + d = (indJ==Y_METRIC_LENGTH-1? 0: logGapContinuationProbability); + metrics[MATCH_OFFSET] = matchMetricArray[indI-1][indJ] + c; + metrics[X_OFFSET] = XMetricArray[indI-1][indJ] + d; + metrics[Y_OFFSET] = Double.NEGATIVE_INFINITY; //YMetricArray[indI-1][indJ] + logGapOpenProbability; + + if (doViterbi) { + bestMetricIdx = MathUtils.maxElementIndex(metrics); + bestMetric = metrics[bestMetricIdx]; + } + else + bestMetric = MathUtils.softMax(metrics); + + XMetricArray[indI][indJ] = bestMetric; + bestActionArrayX[indI][indJ] = ACTIONS_X[bestMetricIdx]; + + // update Y array + c = (indI==X_METRIC_LENGTH-1? 0: logGapOpenProbability); + d = (indI==X_METRIC_LENGTH-1? 0: logGapContinuationProbability); + + metrics[MATCH_OFFSET] = matchMetricArray[indI][indJ-1] + c; + metrics[X_OFFSET] = Double.NEGATIVE_INFINITY; //XMetricArray[indI][indJ-1] + logGapOpenProbability; + metrics[Y_OFFSET] = YMetricArray[indI][indJ-1] + d; + + if (doViterbi) { + bestMetricIdx = MathUtils.maxElementIndex(metrics); + bestMetric = metrics[bestMetricIdx]; + } + else + bestMetric = MathUtils.softMax(metrics); + + YMetricArray[indI][indJ] = bestMetric; + bestActionArrayY[indI][indJ] = ACTIONS_Y[bestMetricIdx]; + + + + } + } + + double bestMetric; + double metrics[] = new double[3]; + int bestTable=0, bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1; + metrics[MATCH_OFFSET] = matchMetricArray[bestI][bestJ]; + metrics[X_OFFSET] = XMetricArray[bestI][bestJ]; + metrics[Y_OFFSET] = YMetricArray[bestI][bestJ]; + if (doViterbi) { + bestTable = MathUtils.maxElementIndex(metrics); + bestMetric = metrics[bestTable]; + } + else + bestMetric = MathUtils.softMax(metrics); + + // Do traceback (needed only for debugging!) + if (DEBUG && doViterbi) { + + int bestAction; + int i = bestI; + int j = bestJ; + + + System.out.println("Affine gap NW"); + + + String haplotypeString = new String (haplotypeBases); + String readString = new String(readBases); + + + while (i >0 || j >0) { + if (bestTable == X_OFFSET) { + // insert gap in Y + haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j); + bestAction = bestActionArrayX[i][j]; + } + else if (bestTable == Y_OFFSET) { + readString = readString.substring(0,i)+"-"+readString.substring(i); + bestAction = bestActionArrayY[i][j]; + + } + else { + bestAction = bestActionArrayM[i][j]; + } + System.out.print(bestAction); + + + // bestAction contains action to take at next step + // encoding of bestAction: upper 2 bits = direction, lower 2 bits = next table + + // bestTable and nextDirection for next step + bestTable = bestAction & 0x3; + int nextDirection = bestAction >> 2; + if (nextDirection == UP) { + i--; + } else if (nextDirection == LEFT) { + j--; + } else { // if (nextDirection == DIAG) + i--; j--; + } + + /* if (j<0 || i < 0) + { + int k=1; + }*/ + } + + + + + System.out.println("\nAlignment: "); + System.out.println("R:"+readString); + System.out.println("H:"+haplotypeString); + System.out.println(); + + + } + if (DEBUG) + System.out.format("Likelihood: %5.4f\n", bestMetric); + + return bestMetric; + + } + + + public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List haplotypesInVC, ReferenceContext ref){ + double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()]; + double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()]; + int readIdx=0; + + if (DEBUG) { + System.out.println("Reference bases:"); + System.out.println(new String(ref.getBases())); + } + for (SAMRecord read : pileup.getReads()) { + + // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) + // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent + if (DEBUG) + System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(), + read.getAlignmentStart(), + read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), + read.getCigarString()); + + + // get bases of candidate haplotypes that overlap with reads + int offset = trailingBases / 2; + long readStart = read.getUnclippedStart(); + long readEnd = read.getUnclippedEnd(); + + int numStartSoftClippedBases =0, numEndSoftClippedBases = 0; + if (!useSoftClippedBases) { + numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart(); + numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd(); + + } + + byte[] unclippedReadBases, unclippedReadQuals; + + int numStartClippedBases = numStartSoftClippedBases; + int numEndClippedBases = numEndSoftClippedBases; + unclippedReadBases = read.getReadBases(); + unclippedReadQuals = read.getBaseQualities(); + + // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, + // and may leave a string of Q2 bases still hanging off the reads. + for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) { + if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) + numStartClippedBases++; + else + break; + + } + for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){ + if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) + numEndClippedBases++; + else + break; + } + + long start = Math.max(readStart + numStartClippedBases - offset - ReadUtils.getFirstInsertionOffset(read), 0); + long stop = readEnd -numEndClippedBases + offset + ReadUtils.getLastInsertionOffset(read); + + // ok, we now figured out total number of clipped bases on both ends. + // Figure out where we want to place the haplotype to score read against + if (DEBUG) + System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", + numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength()); + + // Variables start and stop are coordinates (inclusive) where we want to get the haplotype from. + int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases; + // check if start of read will be before start of reference context + if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut + start = ref.getWindow().getStart(); + + // check also if end of read will go beyond reference context + if (stop > ref.getWindow().getStop()) + stop = ref.getWindow().getStop(); + + // if there's an insertion in the read, the read stop position will be less than start + read legnth, + // but we want to compute likelihoods in the whole region that a read might overlap + if (stop <= start + readLength) { + stop = start + readLength-1; + } + + + + /* + byte[] readAlignedBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), read.getReadBases()); // Adjust the read bases based on the Cigar string + byte[] readAlignedQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), read.getBaseQualities()); // Shift the location of the qual scores based on the Cigar string + + + System.out.println("Aligned bases:"+new String(readAlignedBases)); + */ + + + if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) { + if (DEBUG) + System.out.println("BAD READ!!"); + + for (int j=0; j < haplotypesInVC.size(); j++) { + readLikelihoods[readIdx][j]= 0; + } + } + else { + byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases, + unclippedReadBases.length-numEndClippedBases); + + byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases, + unclippedReadBases.length-numEndClippedBases); + + + if (DEBUG) { + System.out.println("Read bases:"); + System.out.println(new String(readBases)); + } + + + // start and stop have indices into + + for (int j=0; j < haplotypesInVC.size(); j++) { + Haplotype haplotype = haplotypesInVC.get(j); + + + // cut haplotype bases + //long indStart = start - haplotype.getStartPosition(); + //long indStop = stop - haplotype.getStartPosition(); + + //long indStart = 0; + //long indStop = 400; + //byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(), + // (int)indStart, (int)indStop); + byte[] haplotypeBases = haplotype.getBasesAsBytes(); + //BAQ.BAQCalculationResult baqRes = baq.calcBAQFromHMM(read, haplotypeBases, (int)(start - readStart)); + /* if (s1 != null && s2 != null) { + haplotypeBases = s2.getBytes(); + readBases = s1.getBytes(); + readQuals = null; + readQuals = new byte[readBases.length]; + for (int i=0; i < readQuals.length; i++) + readQuals[i] = (byte)30; + + byte[] iq = new byte[readBases.length]; + byte[] q = new byte[readBases.length]; + int[] state = new int[readBases.length]; + int a=baq.hmm_glocal(haplotypeBases, readBases, 0, readBases.length, iq, state, q); + System.out.println("BAQ State:"); + for (int i=0; i < state.length; i++) + System.out.format("%d ",state[i]); + System.out.println(); + System.out.println("BAQ Q:"); + for (int i=0; i < q.length; i++) + System.out.format("%d ",(int)q[i]); + System.out.println(); + + } */ + + if (DEBUG) { + System.out.println("Haplotype to test:"); + System.out.println(new String(haplotypeBases)); + } + + if (useAffineGapModel) + readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals); + else + readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals); + + + } + } + + readIdx++; + } + + if (DEBUG) { + System.out.println("\nLikelihood summary"); + for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) { + System.out.format("Read Index: %d ",readIdx); + for (int i=0; i < readLikelihoods[readIdx].length; i++) + System.out.format("L%d: %f ",i,readLikelihoods[readIdx][i]); + System.out.println(); + } + + } + for (int i=0; i < haplotypesInVC.size(); i++) { + for (int j=i; j < haplotypesInVC.size(); j++){ + // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] + // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2) + //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] ) + double[] readLikelihood = new double[2]; // diploid sample + for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) { + + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // First term is approximated by Jacobian log with table lookup. + if (Double.isInfinite(readLikelihoods[readIdx][i]) || Double.isInfinite(readLikelihoods[readIdx][j])) + continue; + haplotypeLikehoodMatrix[i][j] += ( MathUtils.softMax(readLikelihoods[readIdx][i], + readLikelihoods[readIdx][j]) + LOG_ONE_HALF); + + } + + + } + } + + return haplotypeLikehoodMatrix; + + } + + public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) { + int hSize = haplotypeLikehoodMatrix.length; + double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2]; + + int k=0; + double maxElement = Double.NEGATIVE_INFINITY; + for (int i=0; i < hSize; i++) { + for (int j=i; j < hSize; j++){ + genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j]; + if (haplotypeLikehoodMatrix[i][j] > maxElement) + maxElement = haplotypeLikehoodMatrix[i][j]; + } + } + + // renormalize + for (int i=0; i < genotypeLikelihoods.length; i++) + genotypeLikelihoods[i] -= maxElement; + + return genotypeLikelihoods; + + } + + /* Retrieves bases and qualities from a read that align to a start and stop position within a given contig */ + public static Pair getBasesBetweenPositions(SAMRecord read, long startPos, long endPos ) { + + final Cigar cigar = read.getCigar(); + final byte[] bases = read.getReadBases(); + final byte[] quals = read.getBaseQualities(); + // final byte[] alignment = new byte[alignmentLength]; + long alignStartPos = startPos; + long readStartPos = read.getUnclippedStart(); + + if (alignStartPos < readStartPos ) + alignStartPos = readStartPos; + + long alignEndPos = endPos; + long readEndPos = read.getUnclippedEnd(); + + if (alignEndPos > readEndPos ) + alignEndPos = readEndPos; + + // We need to now locate read bases between alignStartPos and alignEndPos + + //first see how many bytes will we require for output + final byte[] readAlignment = new byte[bases.length]; + final byte[] qualAlignment = new byte[bases.length]; + + int outPos = 0; + int readPos = 0; + int alignPos = 0; + for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch( ce.getOperator() ) { + case I: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + // check if insertion falls before window we're looking for + if (readStartPos +readPos >= alignStartPos) { + readAlignment[outPos] = bases[readPos]; + qualAlignment[outPos] = quals[readPos]; + outPos++; + } + readPos++; + } + break; + case D: + case N: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + alignPos++; + if (alignStartPos + alignPos > alignEndPos) + break; + } + break; + case S: + case M: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + if (readStartPos +readPos >= alignStartPos) { + readAlignment[outPos] = bases[readPos]; + qualAlignment[outPos] = quals[readPos]; + outPos++; + alignPos++; + } + + readPos++; + + if (alignStartPos + alignPos > alignEndPos) + break; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); + } + // if we're already beyong of edge of window of interest, nothing more to do + if (alignStartPos + alignPos > alignEndPos) + break; + + } + return new Pair(Arrays.copyOf(readAlignment,outPos), Arrays.copyOf(qualAlignment,outPos)); + } + +} diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index cb54f0d60..68017b039 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -873,4 +873,85 @@ public class MathUtils { public static double ratio(int num, int denom) { return ((double)num) / (Math.max(denom, 1)); } public static double ratio(long num, long denom) { return ((double)num) / (Math.max(denom, 1)); } + + public static final double[] log10Cache; + public static final double[] jacobianLogTable; + public static final int JACOBIAN_LOG_TABLE_SIZE = 101; + public static final double JACOBIAN_LOG_TABLE_STEP = 0.1; + public static final double MAX_JACOBIAN_TOLERANCE = 10.0; + private static final int MAXN = 10000; + + static { + log10Cache = new double[2*MAXN]; + jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; + + log10Cache[0] = Double.NEGATIVE_INFINITY; + for (int k=1; k < 2*MAXN; k++) + log10Cache[k] = Math.log10(k); + + for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { + jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k) + * JACOBIAN_LOG_TABLE_STEP)); + + } } + + static public double softMax(double[] vec) { + double acc = vec[0]; + for (int k=1; k < vec.length; k++) + acc = softMax(acc,vec[k]); + + return acc; + + } + + static public double max(double x0, double x1, double x2) { + double a = Math.max(x0,x1); + return Math.max(a,x2); + } + + static public double softMax(double x0, double x1, double x2) { + // compute naively log10(10^x[0] + 10^x[1]+...) + // return Math.log10(MathUtils.sumLog10(vec)); + + // better approximation: do Jacobian logarithm function on data pairs + double a = softMax(x0,x1); + return softMax(a,x2); + } + + static public double softMax(double x, double y) { + if (Double.isInfinite(x)) + return y; + + if (Double.isInfinite(y)) + return x; + + if (y >= x + MAX_JACOBIAN_TOLERANCE) + return y; + if (x >= y + MAX_JACOBIAN_TOLERANCE) + return x; + + // OK, so |y-x| < tol: we use the following identity then: + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup + // with integer quantization + double diff = Math.abs(x-y); + double t1 =x; + if (y > x) + t1 = y; + // t has max(x,y) + // we have pre-stored correction for 0,0.1,0.2,... 10.0 + int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP); + double t2 = jacobianLogTable[ind]; + + // gdebug+ + //double z =Math.log10(1+Math.pow(10.0,-diff)); + //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); + //gdebug- + return t1+t2; + // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); + } + +} diff --git a/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index fb3f760a2..425e410d5 100644 --- a/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -6,9 +6,11 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.ReadUtils; /* The topology of the profile HMM: @@ -261,7 +263,26 @@ public class BAQ { s[l_query+1] = sum; // the last scaling factor } + //gdbebug+ +/* + double cac=0.; + // undo scaling of forward probabilities to obtain plain probability of observation given model + double[] su = new double[f[l_query].length]; + { + double sum = 0.; + double[] logs = new double[s.length]; + for (k=0; k < logs.length; k++) { + logs[k] = Math.log10(s[k]); + sum += logs[k]; + } + for (k=0; k < f[l_query].length; k++) + su[k]= Math.log10(f[l_query][k])+ sum; + cac = MathUtils.softMax(su); + } + System.out.format("s:%f\n",cac); + // gdebug- + */ /*** backward ***/ // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from) for (k = 1; k <= l_ref; ++k) { @@ -321,7 +342,7 @@ public class BAQ { max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0 if (state != null) state[qstart+i-1] = max_k; if (q != null) { - k = (int)(-4.343 * Math.log(1. - max) + .499); + k = (int)(-4.343 * Math.log(1. - max) + .499); // = 10*log10(1-max) q[qstart+i-1] = (byte)(k > 100? 99 : (k < minBaseQual ? minBaseQual : k)); } //System.out.println("("+pb+","+sum+")"+" ("+(i-1)+","+(max_k>>2)+","+(max_k&3)+","+max+")"); @@ -457,28 +478,12 @@ public class BAQ { } } - private static int getFirstInsertionOffset(SAMRecord read) { - CigarElement e = read.getCigar().getCigarElement(0); - if ( e.getOperator() == CigarOperator.I ) - return e.getLength(); - else - return 0; - } - - private static int getLastInsertionOffset(SAMRecord read) { - CigarElement e = read.getCigar().getCigarElement(read.getCigarLength()-1); - if ( e.getOperator() == CigarOperator.I ) - return e.getLength(); - else - return 0; - } - - public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) { + public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) { // start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar int offset = getBandWidth() / 2; long readStart = includeClippedBases ? read.getUnclippedStart() : read.getAlignmentStart(); - long start = Math.max(readStart - offset - getFirstInsertionOffset(read), 0); - long stop = (includeClippedBases ? read.getUnclippedEnd() : read.getAlignmentEnd()) + offset + getLastInsertionOffset(read); + long start = Math.max(readStart - offset - ReadUtils.getFirstInsertionOffset(read), 0); + long stop = (includeClippedBases ? read.getUnclippedEnd() : read.getAlignmentEnd()) + offset + ReadUtils.getLastInsertionOffset(read); if ( stop > refReader.getSequenceDictionary().getSequence(read.getReferenceName()).getSequenceLength() ) { return null; diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java index 6d8bf11de..e758ff032 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java @@ -40,8 +40,7 @@ public class Haplotype { protected double[] quals = null; private GenomeLoc genomeLocation = null; private boolean isReference = false; - public static final int LEFT_WINDOW_SIZE = 20; - + /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual * @@ -100,11 +99,17 @@ public class Haplotype { return genomeLocation.getStart(); } + public long getStopPosition() { + return genomeLocation.getStop(); + } + + public boolean isReference() { return isReference; } - public static List makeHaplotypeListFromAlleles(List alleleList, int startPos, ReferenceContext ref, final int haplotypeSize) { + public static List makeHaplotypeListFromAlleles(List alleleList, int startPos, ReferenceContext ref, + final int haplotypeSize, final int numPrefBases) { List haplotypeList = new ArrayList(); @@ -124,12 +129,11 @@ public class Haplotype { byte[] refBases = ref.getBases(); - int numPrefBases = LEFT_WINDOW_SIZE; int startIdxInReference = (int)(1+startPos-numPrefBases-ref.getWindow().getStart()); //int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event - + byte[] basesBeforeVariant = Arrays.copyOfRange(refBases,startIdxInReference,startIdxInReference+numPrefBases); byte[] basesAfterVariant = Arrays.copyOfRange(refBases, startIdxInReference+numPrefBases+ refAllele.getBases().length, refBases.length); diff --git a/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 50da4890d..faf990b3e 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -228,4 +228,24 @@ public class ReadUtils { return Collections.emptyList(); } } + + public final static int getFirstInsertionOffset(SAMRecord read) { + CigarElement e = read.getCigar().getCigarElement(0); + if ( e.getOperator() == CigarOperator.I ) + return e.getLength(); + else + return 0; + } + + public final static int getLastInsertionOffset(SAMRecord read) { + CigarElement e = read.getCigar().getCigarElement(read.getCigarLength()-1); + if ( e.getOperator() == CigarOperator.I ) + return e.getLength(); + else + return 0; + } + + + + }