Merge branch 'master' of ssh://gsa3/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
e9b7324dc1
|
|
@ -72,7 +72,7 @@ public class ErrorModel {
|
|||
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
|
||||
if (refSampleVC.isIndel()) {
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
|
||||
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
|
|||
|
||||
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
|
||||
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
|||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -114,6 +115,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false)
|
||||
protected PrintStream graphWriter = null;
|
||||
|
||||
/**
|
||||
* The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
|
||||
*/
|
||||
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
|
||||
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
|
||||
protected String keepRG = null;
|
||||
|
|
@ -234,14 +241,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
samplesList.addAll( samples );
|
||||
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
|
||||
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
|
||||
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
|
||||
UnifiedArgumentCollection simpleUAC = UAC.clone();
|
||||
UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC);
|
||||
simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
|
||||
simpleUAC.exactCallsLog = null;
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
|
|
@ -287,7 +294,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
}
|
||||
|
||||
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false );
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
|
||||
}
|
||||
|
||||
|
|
@ -400,6 +407,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria
|
||||
if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
|
||||
|
||||
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
|
||||
Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
|
||||
|
||||
// evaluate each sample's reads against all haplotypes
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
|
|
|
|||
|
|
@ -30,6 +30,9 @@ import com.google.java.contract.Requires;
|
|||
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pairhmm.*;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
|
@ -44,8 +47,25 @@ public class LikelihoodCalculationEngine {
|
|||
private final boolean DEBUG;
|
||||
private final PairHMM pairHMM;
|
||||
|
||||
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) {
|
||||
pairHMM = new PairHMM( noBanded );
|
||||
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
|
||||
|
||||
switch (hmmType) {
|
||||
case EXACT:
|
||||
pairHMM = new ExactPairHMM();
|
||||
break;
|
||||
case ORIGINAL:
|
||||
pairHMM = new OriginalPairHMM();
|
||||
break;
|
||||
case CACHING:
|
||||
pairHMM = new CachingPairHMM();
|
||||
break;
|
||||
case LOGLESS_CACHING:
|
||||
pairHMM = new LoglessCachingPairHMM();
|
||||
break;
|
||||
default:
|
||||
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");
|
||||
}
|
||||
|
||||
this.constantGCP = constantGCP;
|
||||
DEBUG = debug;
|
||||
}
|
||||
|
|
@ -69,23 +89,18 @@ public class LikelihoodCalculationEngine {
|
|||
X_METRIC_LENGTH += 2;
|
||||
Y_METRIC_LENGTH += 2;
|
||||
|
||||
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
|
||||
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
|
||||
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
|
||||
pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
|
||||
|
||||
// for each sample's reads
|
||||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
|
||||
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
|
||||
// evaluate the likelihood of the reads given those haplotypes
|
||||
computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey(), matchMetricArray, XMetricArray, YMetricArray );
|
||||
computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() );
|
||||
}
|
||||
}
|
||||
|
||||
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample ) {
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final int numReads = reads.size();
|
||||
|
|
@ -113,9 +128,8 @@ public class LikelihoodCalculationEngine {
|
|||
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
|
||||
previousHaplotypeSeen = haplotype;
|
||||
|
||||
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(),
|
||||
readQuals, readInsQuals, readDelQuals, overallGCP,
|
||||
haplotypeStart, matchMetricArray, XMetricArray, YMetricArray);
|
||||
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
|
||||
readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0);
|
||||
readCounts[jjj][iii] = readCount;
|
||||
}
|
||||
}
|
||||
|
|
@ -130,7 +144,7 @@ public class LikelihoodCalculationEngine {
|
|||
return iii;
|
||||
}
|
||||
}
|
||||
return b1.length;
|
||||
return Math.min(b1.length, b2.length);
|
||||
}
|
||||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
|
|
@ -280,7 +294,7 @@ public class LikelihoodCalculationEngine {
|
|||
final int numHaplotypes = haplotypes.size();
|
||||
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
|
||||
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
|
||||
bestHaplotypesIndexList.add(0); // always start with the reference haplotype
|
||||
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
|
||||
// set up the default 1-to-1 haplotype mapping object
|
||||
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
|
|
@ -322,6 +336,13 @@ public class LikelihoodCalculationEngine {
|
|||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
public static int findReferenceIndex( final List<Haplotype> haplotypes ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.isReference() ) { return haplotypes.indexOf(h); }
|
||||
}
|
||||
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
|
||||
}
|
||||
|
||||
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,181 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin, carneiro
|
||||
* Date: 10/16/12
|
||||
*/
|
||||
|
||||
public class CachingPairHMM extends OriginalPairHMM {
|
||||
|
||||
double[][] constantMatrix = null; // The cache in the CachingPairHMM
|
||||
double[][] distanceMatrix = null; // The cache in the CachingPairHMM
|
||||
|
||||
protected static final double [] firstRowConstantMatrix = {
|
||||
QualityUtils.qualToProbLog10((byte) (DEFAULT_GOP + DEFAULT_GOP)),
|
||||
QualityUtils.qualToProbLog10(DEFAULT_GCP),
|
||||
QualityUtils.qualToErrorProbLog10(DEFAULT_GOP),
|
||||
QualityUtils.qualToErrorProbLog10(DEFAULT_GCP),
|
||||
0.0,
|
||||
0.0
|
||||
};
|
||||
|
||||
@Override
|
||||
public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
|
||||
|
||||
super.initialize(READ_MAX_LENGTH, HAPLOTYPE_MAX_LENGTH);
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
|
||||
final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
|
||||
|
||||
constantMatrix = new double[X_METRIC_LENGTH][6];
|
||||
distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
// fill in the first row
|
||||
for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) {
|
||||
updateCell(1, jjj, 0.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
|
||||
if( recacheReadValues ) {
|
||||
initializeConstants( insertionGOP, deletionGOP, overallGCP );
|
||||
}
|
||||
initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
for (int i = 2; i < X_METRIC_LENGTH; i++) {
|
||||
for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) {
|
||||
updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
|
||||
// final probability is the log10 sum of the last element in all three state arrays
|
||||
final int endI = X_METRIC_LENGTH - 1;
|
||||
final int endJ = Y_METRIC_LENGTH - 1;
|
||||
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the matrix that holds all the constants related to the editing
|
||||
* distance between the read and the haplotype.
|
||||
*
|
||||
* @param haplotypeBases the bases of the haplotype
|
||||
* @param readBases the bases of the read
|
||||
* @param readQuals the base quality scores of the read
|
||||
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
|
||||
*/
|
||||
public void initializeDistanceMatrix( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final int startIndex ) {
|
||||
|
||||
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
|
||||
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
|
||||
|
||||
for (int i = 0; i < readBases.length; i++) {
|
||||
final byte x = readBases[i];
|
||||
final byte qual = readQuals[i];
|
||||
for (int j = startIndex; j < haplotypeBases.length; j++) {
|
||||
final byte y = haplotypeBases[j];
|
||||
distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
|
||||
QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the matrix that holds all the constants related to quality scores.
|
||||
*
|
||||
* @param insertionGOP insertion quality scores of the read
|
||||
* @param deletionGOP deletion quality scores of the read
|
||||
* @param overallGCP overall gap continuation penalty
|
||||
*/
|
||||
public void initializeConstants( final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP ) {
|
||||
|
||||
final int l = insertionGOP.length;
|
||||
constantMatrix[1] = firstRowConstantMatrix;
|
||||
for (int i = 0; i < l; i++) {
|
||||
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
|
||||
constantMatrix[i+2][0] = QualityUtils.qualToProbLog10((byte) qualIndexGOP);
|
||||
constantMatrix[i+2][1] = QualityUtils.qualToProbLog10(overallGCP[i]);
|
||||
constantMatrix[i+2][2] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]);
|
||||
constantMatrix[i+2][3] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
|
||||
constantMatrix[i+2][4] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]);
|
||||
constantMatrix[i+2][5] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
|
||||
}
|
||||
constantMatrix[l+1][4] = 0.0;
|
||||
constantMatrix[l+1][5] = 0.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates a cell in the HMM matrix
|
||||
*
|
||||
* The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
|
||||
* initial conditions
|
||||
|
||||
* @param indI row index in the matrices to update
|
||||
* @param indJ column index in the matrices to update
|
||||
* @param prior the likelihood editing distance matrix for the read x haplotype
|
||||
* @param constants an array with the six constants relevant to this location
|
||||
* @param matchMetricArray the matches likelihood matrix
|
||||
* @param XMetricArray the insertions likelihood matrix
|
||||
* @param YMetricArray the deletions likelihood matrix
|
||||
*/
|
||||
private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
matchMetricArray[indI][indJ] = prior +
|
||||
MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ - 1] + constants[0],
|
||||
XMetricArray[indI - 1][indJ - 1] + constants[1],
|
||||
YMetricArray[indI - 1][indJ - 1] + constants[1] );
|
||||
XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ] + constants[2],
|
||||
XMetricArray[indI - 1][indJ] + constants[3]);
|
||||
YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI][indJ - 1] + constants[4],
|
||||
YMetricArray[indI][indJ - 1] + constants[5]);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,187 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin, carneiro
|
||||
* Date: 10/16/12
|
||||
*/
|
||||
|
||||
public class LoglessCachingPairHMM extends CachingPairHMM {
|
||||
|
||||
protected static final double SCALE_FACTOR_LOG10 = 300.0;
|
||||
|
||||
protected static final double [] firstRowConstantMatrix = {
|
||||
QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)),
|
||||
QualityUtils.qualToProb(DEFAULT_GCP),
|
||||
QualityUtils.qualToErrorProb(DEFAULT_GOP),
|
||||
QualityUtils.qualToErrorProb(DEFAULT_GCP),
|
||||
1.0,
|
||||
1.0
|
||||
};
|
||||
|
||||
@Override
|
||||
public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
|
||||
final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
|
||||
|
||||
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
Arrays.fill(matchMetricArray[iii], 0.0);
|
||||
Arrays.fill(XMetricArray[iii], 0.0);
|
||||
Arrays.fill(YMetricArray[iii], 0.0);
|
||||
}
|
||||
|
||||
// the initial condition
|
||||
matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10); // Math.log10(1.0);
|
||||
|
||||
constantMatrix = new double[X_METRIC_LENGTH][6];
|
||||
distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
// fill in the first row
|
||||
for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) {
|
||||
updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
|
||||
if( recacheReadValues ) {
|
||||
initializeConstants( insertionGOP, deletionGOP, overallGCP );
|
||||
}
|
||||
initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
for (int i = 2; i < X_METRIC_LENGTH; i++) {
|
||||
for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) {
|
||||
updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
|
||||
// final probability is the log10 sum of the last element in all three state arrays
|
||||
final int endI = X_METRIC_LENGTH - 1;
|
||||
final int endJ = Y_METRIC_LENGTH - 1;
|
||||
return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the matrix that holds all the constants related to the editing
|
||||
* distance between the read and the haplotype.
|
||||
*
|
||||
* @param haplotypeBases the bases of the haplotype
|
||||
* @param readBases the bases of the read
|
||||
* @param readQuals the base quality scores of the read
|
||||
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
|
||||
*/
|
||||
public void initializeDistanceMatrix( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final int startIndex ) {
|
||||
|
||||
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
|
||||
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
|
||||
|
||||
for (int i = 0; i < readBases.length; i++) {
|
||||
final byte x = readBases[i];
|
||||
final byte qual = readQuals[i];
|
||||
for (int j = startIndex; j < haplotypeBases.length; j++) {
|
||||
final byte y = haplotypeBases[j];
|
||||
distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
|
||||
QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the matrix that holds all the constants related to quality scores.
|
||||
*
|
||||
* @param insertionGOP insertion quality scores of the read
|
||||
* @param deletionGOP deletion quality scores of the read
|
||||
* @param overallGCP overall gap continuation penalty
|
||||
*/
|
||||
public void initializeConstants( final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP ) {
|
||||
|
||||
final int l = insertionGOP.length;
|
||||
constantMatrix[1] = firstRowConstantMatrix;
|
||||
for (int i = 0; i < l; i++) {
|
||||
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
|
||||
constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP);
|
||||
constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]);
|
||||
constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]);
|
||||
constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]);
|
||||
constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
}
|
||||
constantMatrix[l+1][4] = 1.0;
|
||||
constantMatrix[l+1][5] = 1.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates a cell in the HMM matrix
|
||||
*
|
||||
* The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
|
||||
* initial conditions
|
||||
|
||||
* @param indI row index in the matrices to update
|
||||
* @param indJ column index in the matrices to update
|
||||
* @param prior the likelihood editing distance matrix for the read x haplotype
|
||||
* @param constants an array with the six constants relevant to this location
|
||||
* @param matchMetricArray the matches likelihood matrix
|
||||
* @param XMetricArray the insertions likelihood matrix
|
||||
* @param YMetricArray the deletions likelihood matrix
|
||||
*/
|
||||
private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] +
|
||||
XMetricArray[indI - 1][indJ - 1] * constants[1] +
|
||||
YMetricArray[indI - 1][indJ - 1] * constants[1] );
|
||||
XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3];
|
||||
YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5];
|
||||
}
|
||||
}
|
||||
|
|
@ -70,7 +70,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("fa5c5eb996e95aed12c50d70e6dd74d7"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c54c0c9411054bf629bfd98b616e53fc"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -23,24 +23,26 @@
|
|||
*/
|
||||
|
||||
// our package
|
||||
package org.broadinstitute.sting.utils;
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class PairHMMUnitTest extends BaseTest {
|
||||
final static boolean EXTENSIVE_TESTING = true;
|
||||
PairHMM hmm = new PairHMM( false ); // reference implementation
|
||||
PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding
|
||||
PairHMM exactHMM = new ExactPairHMM(); // the log truth implementation
|
||||
PairHMM originalHMM = new OriginalPairHMM(); // the reference implementation
|
||||
PairHMM cachingHMM = new CachingPairHMM();
|
||||
PairHMM loglessHMM = new LoglessCachingPairHMM();
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -57,7 +59,7 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC";
|
||||
final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA";
|
||||
|
||||
public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) {
|
||||
public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) {
|
||||
this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false);
|
||||
}
|
||||
|
||||
|
|
@ -76,115 +78,51 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
public double expectedLogL() {
|
||||
return expectedQual / -10.0;
|
||||
return (expectedQual / -10.0) + 0.03 ;
|
||||
}
|
||||
|
||||
public double tolerance() {
|
||||
return 0.1; // TODO FIXME arbitrary
|
||||
public double toleranceFromTheoretical() {
|
||||
return 0.2;
|
||||
}
|
||||
|
||||
public double calcLogL() {
|
||||
public double toleranceFromReference() {
|
||||
return 1E-4;
|
||||
}
|
||||
|
||||
double logL = hmm.computeReadLikelihoodGivenHaplotype(
|
||||
public double toleranceFromExact() {
|
||||
return 1E-9;
|
||||
}
|
||||
|
||||
public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) {
|
||||
pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length);
|
||||
return pairHMM.computeReadLikelihoodGivenHaplotypeLog10(
|
||||
refBasesWithContext, readBasesWithContext,
|
||||
qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true),
|
||||
qualAsBytes(gcp, false));
|
||||
|
||||
return logL;
|
||||
qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel),
|
||||
qualAsBytes(gcp, false, anchorIndel), 0, true);
|
||||
}
|
||||
|
||||
private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
|
||||
return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
|
||||
}
|
||||
|
||||
private byte[] qualAsBytes(final int phredQual, final boolean doGOP) {
|
||||
private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) {
|
||||
final byte phredQuals[] = new byte[readBasesWithContext.length];
|
||||
// initialize everything to MASSIVE_QUAL so it cannot be moved by HMM
|
||||
Arrays.fill(phredQuals, (byte)100);
|
||||
|
||||
// update just the bases corresponding to the provided micro read with the quality scores
|
||||
if( doGOP ) {
|
||||
phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
|
||||
} else {
|
||||
for ( int i = 0; i < read.length(); i++)
|
||||
phredQuals[i + CONTEXT.length()] = (byte)phredQual;
|
||||
}
|
||||
if( anchorIndel ) {
|
||||
// initialize everything to MASSIVE_QUAL so it cannot be moved by HMM
|
||||
Arrays.fill(phredQuals, (byte)100);
|
||||
|
||||
return phredQuals;
|
||||
}
|
||||
}
|
||||
|
||||
final Random random = new Random(87865573);
|
||||
private class BandedLikelihoodTestProvider extends TestDataProvider {
|
||||
final String ref, read;
|
||||
final byte[] refBasesWithContext, readBasesWithContext;
|
||||
final int baseQual, insQual, delQual, gcp;
|
||||
final int expectedQual;
|
||||
final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC";
|
||||
final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA";
|
||||
final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT";
|
||||
final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA";
|
||||
final byte[] baseQuals, insQuals, delQuals, gcps;
|
||||
|
||||
public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) {
|
||||
this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false);
|
||||
}
|
||||
|
||||
public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) {
|
||||
super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual));
|
||||
this.baseQual = baseQual;
|
||||
this.delQual = delQual;
|
||||
this.insQual = insQual;
|
||||
this.gcp = gcp;
|
||||
this.read = read;
|
||||
this.ref = ref;
|
||||
this.expectedQual = expectedQual;
|
||||
|
||||
refBasesWithContext = asBytes(ref, left, right);
|
||||
readBasesWithContext = asBytes(read, false, false);
|
||||
baseQuals = qualAsBytes(baseQual);
|
||||
insQuals = qualAsBytes(insQual);
|
||||
delQuals = qualAsBytes(delQual);
|
||||
gcps = qualAsBytes(gcp, false);
|
||||
}
|
||||
|
||||
public double expectedLogL() {
|
||||
double logL = hmm.computeReadLikelihoodGivenHaplotype(
|
||||
refBasesWithContext, readBasesWithContext,
|
||||
baseQuals, insQuals, delQuals, gcps);
|
||||
|
||||
return logL;
|
||||
}
|
||||
|
||||
public double tolerance() {
|
||||
return 0.2; // TODO FIXME arbitrary
|
||||
}
|
||||
|
||||
public double calcLogL() {
|
||||
|
||||
double logL = bandedHMM.computeReadLikelihoodGivenHaplotype(
|
||||
refBasesWithContext, readBasesWithContext,
|
||||
baseQuals, insQuals, delQuals, gcps);
|
||||
|
||||
return logL;
|
||||
}
|
||||
|
||||
private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
|
||||
return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
|
||||
}
|
||||
|
||||
private byte[] qualAsBytes(final int phredQual) {
|
||||
return qualAsBytes(phredQual, true);
|
||||
}
|
||||
|
||||
private byte[] qualAsBytes(final int phredQual, final boolean addRandom) {
|
||||
final byte phredQuals[] = new byte[readBasesWithContext.length];
|
||||
Arrays.fill(phredQuals, (byte)phredQual);
|
||||
if(addRandom) {
|
||||
for( int iii = 0; iii < phredQuals.length; iii++) {
|
||||
phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3));
|
||||
// update just the bases corresponding to the provided micro read with the quality scores
|
||||
if( doGOP ) {
|
||||
phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
|
||||
} else {
|
||||
for ( int i = 0; i < read.length(); i++)
|
||||
phredQuals[i + CONTEXT.length()] = (byte)phredQual;
|
||||
}
|
||||
} else {
|
||||
Arrays.fill(phredQuals, (byte)phredQual);
|
||||
}
|
||||
|
||||
return phredQuals;
|
||||
}
|
||||
}
|
||||
|
|
@ -195,8 +133,8 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
// test all combinations
|
||||
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30);
|
||||
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40);
|
||||
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10);
|
||||
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2);
|
||||
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10);
|
||||
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2);
|
||||
|
||||
for ( final int baseQual : baseQuals ) {
|
||||
for ( final int indelQual : indelQuals ) {
|
||||
|
|
@ -219,7 +157,7 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
|
||||
for ( boolean insertionP : Arrays.asList(true, false)) {
|
||||
final String small = Utils.dupString((char)base, 1);
|
||||
final String big = Utils.dupString((char)base, size);
|
||||
final String big = Utils.dupString((char) base, size);
|
||||
|
||||
final String ref = insertionP ? small : big;
|
||||
final String read = insertionP ? big : small;
|
||||
|
|
@ -238,69 +176,65 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
|
||||
public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) {
|
||||
double calculatedLogL = cfg.calcLogL();
|
||||
double expectedLogL = cfg.expectedLogL();
|
||||
logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString()));
|
||||
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance());
|
||||
}
|
||||
|
||||
@DataProvider(name = "BandedLikelihoodTestProvider")
|
||||
public Object[][] makeBandedLikelihoodTests() {
|
||||
final Random random = new Random(87860573);
|
||||
@DataProvider(name = "OptimizedLikelihoodTestProvider")
|
||||
public Object[][] makeOptimizedLikelihoodTests() {
|
||||
// context on either side is ACGTTGCA REF ACGTTGCA
|
||||
// test all combinations
|
||||
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30);
|
||||
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40);
|
||||
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10);
|
||||
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2);
|
||||
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30);
|
||||
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40);
|
||||
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10);
|
||||
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2);
|
||||
|
||||
for ( final int baseQual : baseQuals ) {
|
||||
for ( final int indelQual : indelQuals ) {
|
||||
for ( final int gcp : gcps ) {
|
||||
|
||||
// test substitutions
|
||||
for ( final byte refBase : BaseUtils.BASES ) {
|
||||
for ( final byte readBase : BaseUtils.BASES ) {
|
||||
final String ref = new String(new byte[]{refBase});
|
||||
final String read = new String(new byte[]{readBase});
|
||||
final int expected = refBase == readBase ? 0 : baseQual;
|
||||
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
|
||||
}
|
||||
}
|
||||
|
||||
// test insertions and deletions
|
||||
for ( final int size : sizes ) {
|
||||
for ( final byte base : BaseUtils.BASES ) {
|
||||
final int expected = indelQual + (size - 2) * gcp;
|
||||
|
||||
for ( boolean insertionP : Arrays.asList(true, false)) {
|
||||
final String small = Utils.dupString((char)base, 1);
|
||||
final String big = Utils.dupString((char)base, size);
|
||||
|
||||
final String ref = insertionP ? small : big;
|
||||
final String read = insertionP ? big : small;
|
||||
|
||||
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
|
||||
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false);
|
||||
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true);
|
||||
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true);
|
||||
for ( final int refSize : sizes ) {
|
||||
for ( final int readSize : sizes ) {
|
||||
String ref = "";
|
||||
String read = "";
|
||||
for( int iii = 0; iii < refSize; iii++) {
|
||||
ref += (char) BaseUtils.BASES[random.nextInt(4)];
|
||||
}
|
||||
for( int iii = 0; iii < readSize; iii++) {
|
||||
read += (char) BaseUtils.BASES[random.nextInt(4)];
|
||||
}
|
||||
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp);
|
||||
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, false);
|
||||
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, false, true);
|
||||
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class);
|
||||
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true)
|
||||
public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) {
|
||||
double calculatedLogL = cfg.calcLogL();
|
||||
@Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
|
||||
public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) {
|
||||
double exactLogL = cfg.calcLogL( exactHMM, true );
|
||||
double calculatedLogL = cfg.calcLogL( originalHMM, true );
|
||||
double optimizedLogL = cfg.calcLogL( cachingHMM, true );
|
||||
double loglessLogL = cfg.calcLogL( loglessHMM, true );
|
||||
double expectedLogL = cfg.expectedLogL();
|
||||
logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString()));
|
||||
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance());
|
||||
//logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString()));
|
||||
Assert.assertEquals(exactLogL, expectedLogL, cfg.toleranceFromTheoretical());
|
||||
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.toleranceFromTheoretical());
|
||||
Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference());
|
||||
Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact());
|
||||
}
|
||||
|
||||
@Test(dataProvider = "OptimizedLikelihoodTestProvider", enabled = true)
|
||||
public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) {
|
||||
double exactLogL = cfg.calcLogL( exactHMM, false );
|
||||
double calculatedLogL = cfg.calcLogL( originalHMM, false );
|
||||
double optimizedLogL = cfg.calcLogL( cachingHMM, false );
|
||||
double loglessLogL = cfg.calcLogL( loglessHMM, false );
|
||||
//logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString()));
|
||||
Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference());
|
||||
Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -322,11 +256,11 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset);
|
||||
// change single base at position k to C. If it's a C, change to T
|
||||
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
|
||||
double res1 = hmm.computeReadLikelihoodGivenHaplotype(
|
||||
originalHMM.initialize(mread.length, haplotype1.length);
|
||||
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
|
||||
haplotype1, mread,
|
||||
quals, gop, gop,
|
||||
gcp);
|
||||
|
||||
gcp, 0, false);
|
||||
|
||||
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
|
||||
|
||||
|
|
@ -353,11 +287,11 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length);
|
||||
// change single base at position k to C. If it's a C, change to T
|
||||
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
|
||||
double res1 = hmm.computeReadLikelihoodGivenHaplotype(
|
||||
originalHMM.initialize(mread.length, haplotype1.length);
|
||||
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
|
||||
haplotype1, mread,
|
||||
quals, gop, gop,
|
||||
gcp);
|
||||
|
||||
gcp, 0, false);
|
||||
|
||||
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
|
||||
|
||||
|
|
@ -69,7 +69,33 @@ public class StandardCallerArgumentCollection {
|
|||
@Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2;
|
||||
|
||||
/**
|
||||
* If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
|
||||
* Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
|
||||
* will try to remove (N * contamination fraction) bases for each alternate allele.
|
||||
*/
|
||||
@Hidden
|
||||
@Argument(fullName = "contamination_percentage_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
|
||||
public double CONTAMINATION_PERCENTAGE = 0.0;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "logExactCalls", doc="x", required=false)
|
||||
public File exactCallsLog = null;
|
||||
|
||||
|
||||
public StandardCallerArgumentCollection() { }
|
||||
|
||||
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
|
||||
public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) {
|
||||
this.alleles = SCAC.alleles;
|
||||
this.GenotypingMode = SCAC.GenotypingMode;
|
||||
this.heterozygosity = SCAC.heterozygosity;
|
||||
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
|
||||
this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
this.OutputMode = SCAC.OutputMode;
|
||||
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE;
|
||||
this.exactCallsLog = SCAC.exactCallsLog;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
|
||||
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
|
||||
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
|
||||
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
|
||||
|
|
|
|||
|
|
@ -41,19 +41,20 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
|||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
|
||||
|
||||
private final boolean useAlleleFromVCF;
|
||||
|
||||
private final double[] likelihoodSums = new double[4];
|
||||
|
||||
private final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
|
||||
|
||||
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
|
||||
}
|
||||
|
||||
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
|
||||
|
|
@ -78,8 +79,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size());
|
||||
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
|
||||
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
|
||||
if ( UAC.CONTAMINATION_PERCENTAGE > 0.0 )
|
||||
pileup = createDecontaminatedPileup(pileup, UAC.CONTAMINATION_PERCENTAGE);
|
||||
if ( useBAQedPileup )
|
||||
pileup = createBAQedPileup( pileup );
|
||||
pileup = createBAQedPileup(pileup);
|
||||
|
||||
// create the GenotypeLikelihoods object
|
||||
final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.PCR_error);
|
||||
|
|
@ -150,8 +153,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
|
||||
// create the genotypes; no-call everyone for now
|
||||
final GenotypesContext genotypes = GenotypesContext.create();
|
||||
final List<Allele> noCall = new ArrayList<Allele>();
|
||||
noCall.add(Allele.NO_CALL);
|
||||
|
||||
for ( SampleGenotypeData sampleData : GLs ) {
|
||||
final double[] allLikelihoods = sampleData.GL.getLikelihoods();
|
||||
|
|
@ -202,6 +203,42 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
return allelesToUse;
|
||||
}
|
||||
|
||||
public ReadBackedPileup createDecontaminatedPileup(final ReadBackedPileup pileup, final double contaminationPercentage) {
|
||||
// special case removal of all reads
|
||||
if ( contaminationPercentage >= 1.0 )
|
||||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());
|
||||
|
||||
// start by stratifying the reads by the alleles they represent at this position
|
||||
for( final PileupElement pe : pileup ) {
|
||||
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
|
||||
if ( baseIndex != -1 )
|
||||
alleleStratifiedElements[baseIndex].add(pe);
|
||||
}
|
||||
|
||||
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
|
||||
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
|
||||
int numReadsToRemove = (int)Math.ceil((double)pileup.getNumberOfElements() * contaminationPercentage);
|
||||
final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
|
||||
@Override
|
||||
public int compare(PileupElement element1, PileupElement element2) {
|
||||
final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
|
||||
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
|
||||
}
|
||||
});
|
||||
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
|
||||
if ( alleleList.size() > numReadsToRemove )
|
||||
elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove));
|
||||
}
|
||||
|
||||
// clean up pointers so memory can be garbage collected if needed
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleStratifiedElements[i].clear();
|
||||
|
||||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
|
||||
}
|
||||
|
||||
public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {
|
||||
final List<PileupElement> BAQedElements = new ArrayList<PileupElement>();
|
||||
for( final PileupElement PE : pileup ) {
|
||||
|
|
@ -220,6 +257,22 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
|
||||
}
|
||||
|
||||
private List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove) {
|
||||
final int pileupSize = elements.size();
|
||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
}
|
||||
|
||||
ArrayList<PileupElement> elementsToKeep = new ArrayList<PileupElement>(pileupSize - numElementsToRemove);
|
||||
for ( int i = 0; i < pileupSize; i++ ) {
|
||||
if ( !itemsToRemove.get(i) )
|
||||
elementsToKeep.add(elements.get(i));
|
||||
}
|
||||
|
||||
return elementsToKeep;
|
||||
}
|
||||
|
||||
private static class SampleGenotypeData {
|
||||
|
||||
public final String name;
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
|
|
@ -65,6 +66,12 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false)
|
||||
public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false;
|
||||
|
||||
/**
|
||||
* The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
|
||||
*/
|
||||
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false)
|
||||
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ORIGINAL;
|
||||
|
||||
/**
|
||||
* The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base
|
||||
* is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value.
|
||||
|
|
@ -112,10 +119,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false)
|
||||
public int INDEL_HAPLOTYPE_SIZE = 80;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false)
|
||||
public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
|
||||
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
|
||||
|
|
@ -183,63 +186,41 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
|
||||
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
|
||||
|
||||
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
|
||||
public UnifiedArgumentCollection clone() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.AFmodel = AFmodel;
|
||||
uac.heterozygosity = heterozygosity;
|
||||
uac.PCR_error = PCR_error;
|
||||
uac.GenotypingMode = GenotypingMode;
|
||||
uac.OutputMode = OutputMode;
|
||||
uac.NO_SLOD = NO_SLOD;
|
||||
uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED;
|
||||
uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
|
||||
uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
|
||||
uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING;
|
||||
uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE;
|
||||
uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY;
|
||||
uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY;
|
||||
uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY;
|
||||
uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO;
|
||||
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
|
||||
uac.alleles = alleles;
|
||||
uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES;
|
||||
uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL;
|
||||
uac.referenceSampleRod = referenceSampleRod;
|
||||
uac.referenceSampleName = referenceSampleName;
|
||||
uac.samplePloidy = samplePloidy;
|
||||
uac.maxQualityScore = minQualityScore;
|
||||
uac.phredScaledPrior = phredScaledPrior;
|
||||
uac.minPower = minPower;
|
||||
uac.minReferenceDepth = minReferenceDepth;
|
||||
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
|
||||
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
|
||||
uac.exactCallsLog = exactCallsLog;
|
||||
|
||||
// todo- arguments to remove
|
||||
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
|
||||
uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION;
|
||||
return uac;
|
||||
}
|
||||
|
||||
public UnifiedArgumentCollection() { }
|
||||
|
||||
public UnifiedArgumentCollection( final StandardCallerArgumentCollection SCAC ) {
|
||||
super();
|
||||
this.alleles = SCAC.alleles;
|
||||
this.GenotypingMode = SCAC.GenotypingMode;
|
||||
this.heterozygosity = SCAC.heterozygosity;
|
||||
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
|
||||
this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
this.OutputMode = SCAC.OutputMode;
|
||||
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
this.exactCallsLog = SCAC.exactCallsLog;
|
||||
public UnifiedArgumentCollection(final StandardCallerArgumentCollection SCAC) {
|
||||
super(SCAC);
|
||||
}
|
||||
|
||||
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
|
||||
public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) {
|
||||
this.GLmodel = uac.GLmodel;
|
||||
this.AFmodel = uac.AFmodel;
|
||||
this.PCR_error = uac.PCR_error;
|
||||
this.NO_SLOD = uac.NO_SLOD;
|
||||
this.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED;
|
||||
this.MIN_BASE_QUALTY_SCORE = uac.MIN_BASE_QUALTY_SCORE;
|
||||
this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION;
|
||||
this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING;
|
||||
this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE;
|
||||
this.INDEL_HETEROZYGOSITY = uac.INDEL_HETEROZYGOSITY;
|
||||
this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY;
|
||||
this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY;
|
||||
this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO;
|
||||
this.INDEL_HAPLOTYPE_SIZE = uac.INDEL_HAPLOTYPE_SIZE;
|
||||
this.TREAT_ALL_READS_AS_SINGLE_POOL = uac.TREAT_ALL_READS_AS_SINGLE_POOL;
|
||||
this.referenceSampleRod = uac.referenceSampleRod;
|
||||
this.referenceSampleName = uac.referenceSampleName;
|
||||
this.samplePloidy = uac.samplePloidy;
|
||||
this.maxQualityScore = uac.minQualityScore;
|
||||
this.phredScaledPrior = uac.phredScaledPrior;
|
||||
this.minPower = uac.minPower;
|
||||
this.minReferenceDepth = uac.minReferenceDepth;
|
||||
this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES;
|
||||
this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO;
|
||||
this.pairHMM = uac.pairHMM;
|
||||
|
||||
// todo- arguments to remove
|
||||
this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,8 +30,11 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.PairHMM;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pairhmm.ExactPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.OriginalPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -48,7 +51,6 @@ public class PairHMMIndelErrorModel {
|
|||
public static final int BASE_QUAL_THRESHOLD = 20;
|
||||
|
||||
private boolean DEBUG = false;
|
||||
private boolean bandedLikelihoods = false;
|
||||
|
||||
private static final int MAX_CACHED_QUAL = 127;
|
||||
|
||||
|
|
@ -67,6 +69,8 @@ public class PairHMMIndelErrorModel {
|
|||
private final byte[] GAP_OPEN_PROB_TABLE;
|
||||
private final byte[] GAP_CONT_PROB_TABLE;
|
||||
|
||||
private final PairHMM pairHMM;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
|
|
@ -85,15 +89,26 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
}
|
||||
|
||||
public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) {
|
||||
public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
|
||||
this.DEBUG = deb;
|
||||
this.bandedLikelihoods = bandedLikelihoods;
|
||||
|
||||
switch (hmmType) {
|
||||
case EXACT:
|
||||
pairHMM = new ExactPairHMM();
|
||||
break;
|
||||
case ORIGINAL:
|
||||
pairHMM = new OriginalPairHMM();
|
||||
break;
|
||||
case CACHING:
|
||||
case LOGLESS_CACHING:
|
||||
default:
|
||||
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL and EXACT.");
|
||||
}
|
||||
|
||||
// fill gap penalty table, affine naive model:
|
||||
this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX];
|
||||
this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX];
|
||||
|
||||
|
||||
for (int i = 0; i < START_HRUN_GAP_IDX; i++) {
|
||||
GAP_OPEN_PROB_TABLE[i] = indelGOP;
|
||||
GAP_CONT_PROB_TABLE[i] = indelGCP;
|
||||
|
|
@ -190,7 +205,6 @@ public class PairHMMIndelErrorModel {
|
|||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
|
||||
final int[] readCounts) {
|
||||
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
|
||||
final PairHMM pairHMM = new PairHMM(bandedLikelihoods);
|
||||
|
||||
int readIdx=0;
|
||||
for (PileupElement p: pileup) {
|
||||
|
|
@ -303,8 +317,6 @@ public class PairHMMIndelErrorModel {
|
|||
final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
|
||||
int j=0;
|
||||
|
||||
// initialize path metric and traceback memories for likelihood computation
|
||||
double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null;
|
||||
byte[] previousHaplotypeSeen = null;
|
||||
final byte[] contextLogGapOpenProbabilities = new byte[readBases.length];
|
||||
final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];
|
||||
|
|
@ -341,14 +353,9 @@ public class PairHMMIndelErrorModel {
|
|||
final int X_METRIC_LENGTH = readBases.length+2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length+2;
|
||||
|
||||
if (matchMetricArray == null) {
|
||||
if (previousHaplotypeSeen == null) {
|
||||
//no need to reallocate arrays for each new haplotype, as length won't change
|
||||
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
|
||||
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
|
||||
pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
|
||||
}
|
||||
|
||||
int startIndexInHaplotype = 0;
|
||||
|
|
@ -356,11 +363,10 @@ public class PairHMMIndelErrorModel {
|
|||
startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
|
||||
previousHaplotypeSeen = haplotypeBases.clone();
|
||||
|
||||
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals,
|
||||
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals,
|
||||
(read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities),
|
||||
(read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities),
|
||||
contextLogGapContinuationProbabilities,
|
||||
startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray);
|
||||
contextLogGapContinuationProbabilities, startIndexInHaplotype, false);
|
||||
|
||||
|
||||
if (DEBUG) {
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
|
|||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
|
||||
public class Haplotype {
|
||||
|
|
@ -184,6 +185,21 @@ public class Haplotype {
|
|||
return new Haplotype(newHaplotypeBases);
|
||||
}
|
||||
|
||||
public static class HaplotypeBaseComparator implements Comparator<Haplotype>, Serializable {
|
||||
@Override
|
||||
public int compare( final Haplotype hap1, final Haplotype hap2 ) {
|
||||
final byte[] arr1 = hap1.getBases();
|
||||
final byte[] arr2 = hap2.getBases();
|
||||
// compares byte arrays using lexical ordering
|
||||
final int len = Math.min(arr1.length, arr2.length);
|
||||
for( int iii = 0; iii < len; iii++ ) {
|
||||
final int cmp = arr1[iii] - arr2[iii];
|
||||
if (cmp != 0) { return cmp; }
|
||||
}
|
||||
return arr2.length - arr1.length;
|
||||
}
|
||||
}
|
||||
|
||||
public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(final List<Allele> alleleList,
|
||||
final int startPos,
|
||||
final ReferenceContext ref,
|
||||
|
|
|
|||
|
|
@ -1,259 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
|
||||
* User: rpoplin
|
||||
* Date: 3/1/12
|
||||
*/
|
||||
|
||||
public class PairHMM {
|
||||
private static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE;
|
||||
private static final byte DEFAULT_GOP = (byte) 45;
|
||||
private static final byte DEFAULT_GCP = (byte) 10;
|
||||
private static final double BANDING_TOLERANCE = 22.0;
|
||||
private static final int BANDING_CLUSTER_WINDOW = 12;
|
||||
private final boolean noBanded;
|
||||
|
||||
public PairHMM() {
|
||||
noBanded = false;
|
||||
}
|
||||
|
||||
public PairHMM( final boolean noBanded ) {
|
||||
this.noBanded = noBanded;
|
||||
}
|
||||
|
||||
|
||||
public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray,
|
||||
final int X_METRIC_LENGTH) {
|
||||
|
||||
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY);
|
||||
Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY);
|
||||
Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// the initial condition
|
||||
matchMetricArray[1][1] = 0.0; // Math.log10(1.0);
|
||||
|
||||
}
|
||||
|
||||
@Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"})
|
||||
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability
|
||||
public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals,
|
||||
final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) {
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
|
||||
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
|
||||
|
||||
return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
|
||||
@Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"})
|
||||
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability
|
||||
public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals,
|
||||
final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
// ensure that all the qual scores have valid values
|
||||
for( int iii = 0; iii < readQuals.length; iii++ ) {
|
||||
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
|
||||
}
|
||||
|
||||
if( false ) {
|
||||
final ArrayList<Integer> workQueue = new ArrayList<Integer>(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step
|
||||
final ArrayList<Integer> workToBeAdded = new ArrayList<Integer>();
|
||||
final ArrayList<Double> calculatedValues = new ArrayList<Double>();
|
||||
final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1;
|
||||
workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype
|
||||
|
||||
for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over
|
||||
//Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order
|
||||
int el = 1;
|
||||
for( int work : workQueue ) {
|
||||
// choose the appropriate diagonal baseline location
|
||||
int iii = 0;
|
||||
int jjj = diag;
|
||||
if( diag > Y_METRIC_LENGTH ) {
|
||||
iii = diag - Y_METRIC_LENGTH;
|
||||
jjj = Y_METRIC_LENGTH;
|
||||
}
|
||||
// move to the starting work location along the diagonal
|
||||
iii += work;
|
||||
jjj -= work;
|
||||
while( iii >= X_METRIC_LENGTH || jjj <= 0 ) {
|
||||
iii--;
|
||||
jjj++;
|
||||
work--;
|
||||
}
|
||||
if( !detectClusteredStartLocations(workToBeAdded, work ) ) {
|
||||
workToBeAdded.add(work); // keep this thread going once it has started
|
||||
}
|
||||
|
||||
if( work >= el - 3 ) {
|
||||
// step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value
|
||||
double maxElement = Double.NEGATIVE_INFINITY;
|
||||
for( el = work; el < numDiags + 1; el++ ) {
|
||||
updateCell(iii, jjj, haplotypeBases, readBases, readQuals,
|
||||
insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray);
|
||||
final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]);
|
||||
calculatedValues.add(bestMetric);
|
||||
if( bestMetric > maxElement ) {
|
||||
maxElement = bestMetric;
|
||||
} else if( maxElement - bestMetric > BANDING_TOLERANCE ) {
|
||||
break;
|
||||
}
|
||||
if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix
|
||||
break;
|
||||
}
|
||||
if( --jjj <= 0 ) { // don't walk off the edge of the matrix
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// find a local maximum to start a new band in the work queue
|
||||
double localMaxElement = Double.NEGATIVE_INFINITY;
|
||||
int localMaxElementIndex = 0;
|
||||
for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) {
|
||||
final double bestMetric = calculatedValues.get(kkk);
|
||||
if( bestMetric > localMaxElement ) {
|
||||
localMaxElement = bestMetric;
|
||||
localMaxElementIndex = kkk;
|
||||
} else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum
|
||||
if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) {
|
||||
workToBeAdded.add( work + localMaxElementIndex );
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
calculatedValues.clear();
|
||||
|
||||
// reset iii and jjj to the appropriate diagonal baseline location
|
||||
iii = 0;
|
||||
jjj = diag;
|
||||
if( diag > Y_METRIC_LENGTH ) {
|
||||
iii = diag - Y_METRIC_LENGTH;
|
||||
jjj = Y_METRIC_LENGTH;
|
||||
}
|
||||
// move to the starting work location along the diagonal
|
||||
iii += work-1;
|
||||
jjj -= work-1;
|
||||
|
||||
// step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value
|
||||
for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) {
|
||||
updateCell(iii, jjj, haplotypeBases, readBases, readQuals,
|
||||
insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray);
|
||||
final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]);
|
||||
if( bestMetric > maxElement ) {
|
||||
maxElement = bestMetric;
|
||||
} else if( maxElement - bestMetric > BANDING_TOLERANCE ) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
workQueue.clear();
|
||||
workQueue.addAll(workToBeAdded);
|
||||
workToBeAdded.clear();
|
||||
}
|
||||
} else {
|
||||
// simple rectangular version of update loop, slow
|
||||
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
|
||||
if( (iii == 1 && jjj == 1) ) { continue; }
|
||||
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
|
||||
matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// final probability is the log10 sum of the last element in all three state arrays
|
||||
final int endI = X_METRIC_LENGTH - 1;
|
||||
final int endJ = Y_METRIC_LENGTH - 1;
|
||||
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
|
||||
}
|
||||
|
||||
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
|
||||
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
|
||||
final int im1 = indI - 1;
|
||||
final int jm1 = indJ - 1;
|
||||
|
||||
// update the match array
|
||||
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
|
||||
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
|
||||
final byte x = readBases[im1-1];
|
||||
final byte y = haplotypeBases[jm1-1];
|
||||
final byte qual = readQuals[im1-1];
|
||||
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
|
||||
}
|
||||
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
|
||||
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
|
||||
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
|
||||
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0);
|
||||
|
||||
// update the X (insertion) array
|
||||
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
|
||||
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1);
|
||||
|
||||
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
|
||||
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
|
||||
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2);
|
||||
}
|
||||
|
||||
// private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other
|
||||
private boolean detectClusteredStartLocations( final ArrayList<Integer> list, int loc ) {
|
||||
for(int x : list) {
|
||||
if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -25,8 +25,11 @@
|
|||
package org.broadinstitute.sting.utils.codecs.hapmap;
|
||||
|
||||
import org.broad.tribble.AsciiFeatureCodec;
|
||||
import org.broad.tribble.FeatureCodecHeader;
|
||||
import org.broad.tribble.annotation.Strand;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
|
@ -116,4 +119,10 @@ public class RawHapMapCodec extends AsciiFeatureCodec<RawHapMapFeature> {
|
|||
}
|
||||
return headerLine;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FeatureCodecHeader readHeader(final PositionalBufferedStream stream) throws IOException {
|
||||
final AsciiLineReader br = new AsciiLineReader(stream);
|
||||
return new FeatureCodecHeader(readHeader(br), br.getPosition());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,8 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf;
|
|||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
|
@ -119,7 +117,7 @@ public class VCFCodec extends AbstractVCFCodec {
|
|||
// empty set for passes filters
|
||||
List<String> fFields = new LinkedList<String>();
|
||||
// otherwise we have to parse and cache the value
|
||||
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
|
||||
if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) )
|
||||
fFields.add(filterString);
|
||||
else
|
||||
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,107 @@
|
|||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 10/16/12
|
||||
*/
|
||||
|
||||
public class ExactPairHMM extends PairHMM {
|
||||
|
||||
@Override
|
||||
public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
|
||||
final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
|
||||
|
||||
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY);
|
||||
Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY);
|
||||
Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// the initial condition
|
||||
matchMetricArray[1][1] = 0.0; // Math.log10(1.0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
// ensure that all the qual scores have valid values
|
||||
for( int iii = 0; iii < readQuals.length; iii++ ) {
|
||||
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
|
||||
}
|
||||
|
||||
// simple rectangular version of update loop, slow
|
||||
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
|
||||
if( (iii == 1 && jjj == 1) ) { continue; }
|
||||
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
|
||||
matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
|
||||
// final probability is the log10 sum of the last element in all three state arrays
|
||||
final int endI = X_METRIC_LENGTH - 1;
|
||||
final int endJ = Y_METRIC_LENGTH - 1;
|
||||
return MathUtils.log10sumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]});
|
||||
}
|
||||
|
||||
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
|
||||
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
|
||||
final int im1 = indI - 1;
|
||||
final int jm1 = indJ - 1;
|
||||
|
||||
// update the match array
|
||||
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
|
||||
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
|
||||
final byte x = readBases[im1-1];
|
||||
final byte y = haplotypeBases[jm1-1];
|
||||
final byte qual = readQuals[im1-1];
|
||||
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
|
||||
}
|
||||
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
|
||||
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
|
||||
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
|
||||
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0});
|
||||
|
||||
// update the X (insertion) array
|
||||
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
|
||||
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1});
|
||||
|
||||
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
|
||||
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
|
||||
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2});
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
/**
|
||||
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
|
||||
* User: rpoplin
|
||||
* Date: 3/1/12
|
||||
*/
|
||||
|
||||
public class OriginalPairHMM extends ExactPairHMM {
|
||||
|
||||
@Override
|
||||
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
// ensure that all the qual scores have valid values
|
||||
for( int iii = 0; iii < readQuals.length; iii++ ) {
|
||||
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
|
||||
}
|
||||
|
||||
// simple rectangular version of update loop, slow
|
||||
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
|
||||
if( (iii == 1 && jjj == 1) ) { continue; }
|
||||
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
|
||||
matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
|
||||
// final probability is the log10 sum of the last element in all three state arrays
|
||||
final int endI = X_METRIC_LENGTH - 1;
|
||||
final int endJ = Y_METRIC_LENGTH - 1;
|
||||
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
|
||||
}
|
||||
|
||||
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
|
||||
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
|
||||
final int im1 = indI - 1;
|
||||
final int jm1 = indJ - 1;
|
||||
|
||||
// update the match array
|
||||
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
|
||||
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
|
||||
final byte x = readBases[im1-1];
|
||||
final byte y = haplotypeBases[jm1-1];
|
||||
final byte qual = readQuals[im1-1];
|
||||
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
|
||||
}
|
||||
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
|
||||
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
|
||||
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
|
||||
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0);
|
||||
|
||||
// update the X (insertion) array
|
||||
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
|
||||
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1);
|
||||
|
||||
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
|
||||
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
|
||||
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 10/16/12
|
||||
*/
|
||||
|
||||
public abstract class PairHMM {
|
||||
protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE;
|
||||
protected static final byte DEFAULT_GOP = (byte) 45;
|
||||
protected static final byte DEFAULT_GCP = (byte) 10;
|
||||
|
||||
public enum HMM_IMPLEMENTATION {
|
||||
/* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */
|
||||
EXACT,
|
||||
/* PairHMM as implemented for the UnifiedGenotyper. Uses log10 sum functions accurate to only 1E-4 */
|
||||
ORIGINAL,
|
||||
/* Optimized version of the PairHMM which caches per-read computations */
|
||||
CACHING,
|
||||
/* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */
|
||||
LOGLESS_CACHING
|
||||
}
|
||||
|
||||
protected double[][] matchMetricArray = null;
|
||||
protected double[][] XMetricArray = null;
|
||||
protected double[][] YMetricArray = null;
|
||||
|
||||
public abstract void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH );
|
||||
|
||||
@Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length",
|
||||
"readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"})
|
||||
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 likelihood
|
||||
public abstract double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues );
|
||||
}
|
||||
Loading…
Reference in New Issue