diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index c7c281943..c5aabc64d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -125,8 +125,8 @@ public class ContextCovariate implements StandardCovariate { */ private BitSet contextWith(byte[] bases, int offset, int contextSize) { BitSet result = null; - if (offset >= contextSize) { - String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset)); + if (offset - contextSize + 1 >= 0) { + String context = new String(Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1)); if (!context.contains("N")) result = BitSetUtils.bitSetFrom(context); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java old mode 100755 new mode 100644 index b3ea88d58..77e4cc8c7 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java @@ -76,7 +76,7 @@ public class Datum { final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT); double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - return Math.min(empiricalQual, (double) QualityUtils.MAX_QUAL_SCORE); + return Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); } byte empiricalQualByte() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java index cedff0a80..64dba0551 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java @@ -31,7 +31,6 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.collections.Pair; @@ -152,7 +151,7 @@ public class RecalDataManager { ArrayList optionalCovariatesToAdd = new ArrayList(); // initialize an empty array of optional covariates to create the first few tables for (Covariate covariate : requiredCovariates) { requiredCovariatesToAdd.add(covariate); - final Map recalTable = new HashMap(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively) + final Map recalTable = new HashMap(); // initializing a new recal table for each required covariate (cumulatively) final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java index c71a00a3a..2dac90252 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java @@ -74,7 +74,7 @@ public class RecalDatum extends Datum { } public final void calcCombinedEmpiricalQuality() { - this.empiricalQuality = empiricalQualDouble(); // cache the value so we don't call log over and over again + 
this.empiricalQuality = empiricalQualDouble(); // cache the value so we don't call log over and over again } public final void calcEstimatedReportedQuality() { @@ -102,7 +102,7 @@ public class RecalDatum extends Datum { @Override public String toString() { - return String.format("%d,%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality()), (byte) Math.floor(getEstimatedQReported())); + return String.format("%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality())); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index d7174536e..b33c036a8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -55,13 +55,10 @@ public class UnifiedArgumentCollection { @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) public Double PCR_error = DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.DEFAULT_PCR_ERROR_RATE; - /** - * Specifies how to determine the alternate allele to use for genotyping - */ - @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) + @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; - @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Should we output confident genotypes (i.e. 
including ref calls) or just the variants?", required = false) + @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; /** @@ -147,11 +144,11 @@ public class UnifiedArgumentCollection { @Hidden @Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty", required = false) - public double INDEL_GAP_CONTINUATION_PENALTY = 10.0; + public byte INDEL_GAP_CONTINUATION_PENALTY = 10; @Hidden @Argument(fullName = "indelGapOpenPenalty", shortName = "indelGOP", doc = "Indel gap open penalty", required = false) - public double INDEL_GAP_OPEN_PENALTY = 45.0; + public byte INDEL_GAP_OPEN_PENALTY = 45; @Hidden @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 9036e3a62..3cec931d0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -116,6 +116,8 @@ import java.util.*; @ReadFilters( {BadMateFilter.class, MappingQualityUnavailableFilter.class} ) @Reference(window=@Window(start=-200,stop=200)) @By(DataSource.REFERENCE) +// TODO -- When LocusIteratorByState gets cleaned up, we should enable multiple @By sources: +// TODO -- @By( {DataSource.READS, DataSource.REFERENCE_ORDERED_DATA} ) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) public class UnifiedGenotyper extends LocusWalker, UnifiedGenotyper.UGStatistics> implements TreeReducible, AnnotatorCompatibleWalker { @@ -155,6 +157,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif 
@Argument(fullName = "debug_file", shortName = "debug_file", doc = "File to print all of the annotated and detailed debugging output", required = false) protected PrintStream verboseWriter = null; + @Hidden @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print any relevant callability metrics output", required = false) protected PrintStream metricsWriter = null; @@ -347,14 +350,6 @@ public class UnifiedGenotyper extends LocusWalker, Unif } public void onTraversalDone(UGStatistics sum) { - logger.info(String.format("Visited bases %d", sum.nBasesVisited)); - logger.info(String.format("Callable bases %d", sum.nBasesCallable)); - logger.info(String.format("Confidently called bases %d", sum.nBasesCalledConfidently)); - logger.info(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll())); - logger.info(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll())); - logger.info(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable())); - logger.info(String.format("Actual calls made %d", sum.nCallsMade)); - if ( metricsWriter != null ) { metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited)); metricsWriter.println(String.format("Callable bases %d", sum.nBasesCallable)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 890ed9e3d..bcb9ea591 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -31,7 +31,9 @@ import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; +import 
org.broadinstitute.sting.utils.PairHMM; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -41,6 +43,7 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.Map; public class PairHMMIndelErrorModel { @@ -60,12 +63,12 @@ public class PairHMMIndelErrorModel { private static final int START_HRUN_GAP_IDX = 4; private static final int MAX_HRUN_GAP_IDX = 20; - private static final double MIN_GAP_OPEN_PENALTY = 30.0; - private static final double MIN_GAP_CONT_PENALTY = 10.0; - private static final double GAP_PENALTY_HRUN_STEP = 1.0; // each increase in hrun decreases gap penalty by this. + private static final byte MIN_GAP_OPEN_PENALTY = 30; + private static final byte MIN_GAP_CONT_PENALTY = 10; + private static final byte GAP_PENALTY_HRUN_STEP = 1; // each increase in hrun decreases gap penalty by this. 
- private final double[] GAP_OPEN_PROB_TABLE; - private final double[] GAP_CONT_PROB_TABLE; + private final byte[] GAP_OPEN_PROB_TABLE; + private final byte[] GAP_CONT_PROB_TABLE; ///////////////////////////// // Private Member Variables @@ -86,42 +89,42 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean bandedLikelihoods) { + public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) { this.DEBUG = deb; this.bandedLikelihoods = bandedLikelihoods; // fill gap penalty table, affine naive model: - this.GAP_CONT_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; - this.GAP_OPEN_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; + this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; + this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - double gop = -indelGOP/10.0; - double gcp = -indelGCP/10.0; for (int i = 0; i < START_HRUN_GAP_IDX; i++) { - GAP_OPEN_PROB_TABLE[i] = gop; - GAP_CONT_PROB_TABLE[i] = gcp; + GAP_OPEN_PROB_TABLE[i] = indelGOP; + GAP_CONT_PROB_TABLE[i] = indelGCP; } double step = GAP_PENALTY_HRUN_STEP/10.0; - double maxGOP = -MIN_GAP_OPEN_PENALTY/10.0; // phred to log prob - double maxGCP = -MIN_GAP_CONT_PENALTY/10.0; // phred to log prob + // initialize gop and gcp to their default values + byte gop = indelGOP; + byte gcp = indelGCP; + // all of the following is computed in Qual-space for (int i=START_HRUN_GAP_IDX; i < MAX_HRUN_GAP_IDX; i++) { - gop += step; - if (gop > maxGOP) - gop = maxGOP; + gop -= GAP_PENALTY_HRUN_STEP; + if (gop < MIN_GAP_OPEN_PENALTY) + gop = MIN_GAP_OPEN_PENALTY; - gcp += step; - if(gcp > maxGCP) - gcp = maxGCP; + gcp -= step; + if(gcp < MIN_GAP_CONT_PENALTY) + gcp = MIN_GAP_CONT_PENALTY; GAP_OPEN_PROB_TABLE[i] = gop; GAP_CONT_PROB_TABLE[i] = gcp; } } - static private void getContextHomopolymerLength(final byte[] refBytes, int[] hrunArray) { + static private void getContextHomopolymerLength(final byte[] refBytes, final 
int[] hrunArray) { // compute forward hrun length, example: // AGGTGACCCCCCTGAGAG // 001000012345000000 @@ -154,203 +157,9 @@ public class PairHMMIndelErrorModel { } - private void updateCell(final int indI, final int indJ, final int X_METRIC_LENGTH, final int Y_METRIC_LENGTH, byte[] readBases, byte[] readQuals, byte[] haplotypeBases, - double[] currentGOP, double[] currentGCP, double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { - if (indI > 0 && indJ > 0) { - final int im1 = indI -1; - final int jm1 = indJ - 1; - // update current point - final byte x = readBases[im1]; - final byte y = haplotypeBases[jm1]; - final byte qual = readQuals[im1] < 1 ? 1 : (readQuals[im1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[im1]); - - final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; - - matchMetricArray[indI][indJ] = pBaseRead + MathUtils.approximateLog10SumLog10(new double[]{matchMetricArray[im1][jm1], XMetricArray[im1][jm1], YMetricArray[im1][jm1]}); - - final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; - final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; - - XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1); - - // update Y array - final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; - final double d2 = indI == X_METRIC_LENGTH-1 ? 
END_GAP_COST : currentGCP[jm1]; - YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2); - } - } - - private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, - double[] currentGOP, double[] currentGCP, int indToStart, - double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { - - final int X_METRIC_LENGTH = readBases.length+1; - final int Y_METRIC_LENGTH = haplotypeBases.length+1; - - if (indToStart == 0) { - // default initialization for all arrays - - for (int i=0; i < X_METRIC_LENGTH; i++) { - Arrays.fill(matchMetricArray[i],Double.NEGATIVE_INFINITY); - Arrays.fill(YMetricArray[i],Double.NEGATIVE_INFINITY); - Arrays.fill(XMetricArray[i],Double.NEGATIVE_INFINITY); - } - - for (int i=1; i < X_METRIC_LENGTH; i++) { - //initialize first column - XMetricArray[i][0] = END_GAP_COST*(i); - } - - for (int j=1; j < Y_METRIC_LENGTH; j++) { - // initialize first row - YMetricArray[0][j] = END_GAP_COST*(j); - } - matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY; - XMetricArray[0][0]= YMetricArray[0][0] = 0; - } - - - if (bandedLikelihoods) { - final double DIAG_TOL = 20; // means that max - min element in diags have to be > this number for banding to take effect. 
- - final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH -1; - final int elemsInDiag = Math.min(X_METRIC_LENGTH, Y_METRIC_LENGTH); - - int idxWithMaxElement = 0; - - for (int diag=indToStart; diag < numDiags; diag++) { - // compute default I and J start positions at edge of diagonals - int indI = 0; - int indJ = diag; - if (diag >= Y_METRIC_LENGTH ) { - indI = diag-(Y_METRIC_LENGTH-1); - indJ = Y_METRIC_LENGTH-1; - } - - // first pass: from max element to edge - int idxLow = idxWithMaxElement; - - // reset diag max value before starting - double maxElementInDiag = Double.NEGATIVE_INFINITY; - // set indI, indJ to correct values - indI += idxLow; - indJ -= idxLow; - if (indI >= X_METRIC_LENGTH || indJ < 0) { - idxLow--; - indI--; - indJ++; - } - - - for (int el = idxLow; el < elemsInDiag; el++) { - updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, - currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); - // update max in diagonal - final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); - - // check if we've fallen off diagonal value by threshold - if (bestMetric > maxElementInDiag) { - maxElementInDiag = bestMetric; - idxWithMaxElement = el; - } - else if (bestMetric < maxElementInDiag - DIAG_TOL && idxWithMaxElement > 0) - break; // done w/current diagonal - - indI++; - if (indI >=X_METRIC_LENGTH ) - break; - indJ--; - if (indJ <= 0) - break; - } - if (idxLow > 0) { - // now do second part in opposite direction - indI = 0; - indJ = diag; - if (diag >= Y_METRIC_LENGTH ) { - indI = diag-(Y_METRIC_LENGTH-1); - indJ = Y_METRIC_LENGTH-1; - } - - indI += idxLow-1; - indJ -= idxLow-1; - for (int el = idxLow-1; el >= 0; el--) { - - updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, - currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); - // update max in diagonal - final double 
bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); - - // check if we've fallen off diagonal value by threshold - if (bestMetric > maxElementInDiag) { - maxElementInDiag = bestMetric; - idxWithMaxElement = el; - } - else if (bestMetric < maxElementInDiag - DIAG_TOL) - break; // done w/current diagonal - - indJ++; - if (indJ >= Y_METRIC_LENGTH ) - break; - indI--; - if (indI <= 0) - break; - } - } - // if (DEBUG) - // System.out.format("Max:%4.1f el:%d\n",maxElementInDiag, idxWithMaxElement); - } - } - else { - // simplified rectangular version of update loop - for (int indI=1; indI < X_METRIC_LENGTH; indI++) { - for (int indJ=indToStart+1; indJ < Y_METRIC_LENGTH; indJ++) { - updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, - currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); - - } - } - } - - - - final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1; - final double bestMetric = MathUtils.approximateLog10SumLog10(new double[]{ matchMetricArray[bestI][bestJ], XMetricArray[bestI][bestJ], YMetricArray[bestI][bestJ] }); - - /* - if (DEBUG) { - PrintStream outx, outy, outm, outs; - double[][] sumMetrics = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - try { - outx = new PrintStream("datax.txt"); - outy = new PrintStream("datay.txt"); - outm = new PrintStream("datam.txt"); - outs = new PrintStream("datas.txt"); - double metrics[] = new double[3]; - for (int indI=0; indI < X_METRIC_LENGTH; indI++) { - for (int indJ=0; indJ < Y_METRIC_LENGTH; indJ++) { - metrics[0] = matchMetricArray[indI][indJ]; - metrics[1] = XMetricArray[indI][indJ]; - metrics[2] = YMetricArray[indI][indJ]; - //sumMetrics[indI][indJ] = MathUtils.softMax(metrics); - outx.format("%4.1f ", metrics[1]); - outy.format("%4.1f ", metrics[2]); - outm.format("%4.1f ", metrics[0]); - outs.format("%4.1f ", MathUtils.softMax(metrics)); - } - outx.println(); 
outm.println();outy.println(); outs.println(); - } - outm.close(); outx.close(); outy.close(); - } catch (java.io.IOException e) { throw new UserException("bla");} - } - */ - - return bestMetric; - - } - - private void fillGapProbabilities(int[] hrunProfile, - double[] contextLogGapOpenProbabilities, double[] contextLogGapContinuationProbabilities) { + private void fillGapProbabilities(final int[] hrunProfile, + final byte[] contextLogGapOpenProbabilities, + final byte[] contextLogGapContinuationProbabilities) { // fill based on lookup table for (int i = 0; i < hrunProfile.length; i++) { if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) { @@ -372,27 +181,8 @@ public class PairHMMIndelErrorModel { final int readCounts[] = new int[pileup.getNumberOfElements()]; int readIdx=0; - LinkedHashMap gapOpenProbabilityMap = new LinkedHashMap(); - LinkedHashMap gapContProbabilityMap = new LinkedHashMap(); - - // will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes. 
- // todo -- refactor into separate function - for (Allele a: haplotypeMap.keySet()) { - Haplotype haplotype = haplotypeMap.get(a); - byte[] haplotypeBases = haplotype.getBases(); - double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length]; - double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length]; - - // get homopolymer length profile for current haplotype - int[] hrunProfile = new int[haplotypeBases.length]; - getContextHomopolymerLength(haplotypeBases,hrunProfile); - fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); - - gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities); - gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities); - - } + PairHMM pairHMM = new PairHMM(bandedLikelihoods); for (PileupElement p: pileup) { // > 1 when the read is a consensus read representing multiple independent observations readCounts[readIdx] = p.getRepresentativeCount(); @@ -406,14 +196,32 @@ public class PairHMMIndelErrorModel { } } else { + if (DEBUG) { + System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); + } // System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName()); GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + if (read.isEmpty()) continue; - if(ReadUtils.is454Read(read)) { + if (read.getUnclippedEnd() > ref.getWindow().getStop()) + read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop()); + + if (read.isEmpty()) continue; - } + + if (read.getUnclippedStart() < ref.getWindow().getStart()) + read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart()); + + if (read.isEmpty()) + continue; + // hard-clip low quality ends - this may introduce extra H elements in CIGAR string + read = 
ReadClipper.hardClipLowQualEnds(read,(byte)BASE_QUAL_THRESHOLD ); + + if (read.isEmpty()) + continue; + // get bases of candidate haplotypes that overlap with reads final int trailingBases = 3; @@ -469,54 +277,56 @@ public class PairHMMIndelErrorModel { unclippedReadBases = read.getReadBases(); unclippedReadQuals = read.getBaseQualities(); - // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, - // and may leave a string of Q2 bases still hanging off the reads. - for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) { - if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) - numStartClippedBases++; - else - break; + final int extraOffset = Math.abs(eventLength); - } - for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){ - if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) - numEndClippedBases++; - else - break; - } + /** + * Compute genomic locations that candidate haplotypes will span. + * Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord, + * adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above. + * We will propose haplotypes that overlap the read with some padding. + * True read start = readStart + numStartClippedBases - ReadUtils.getFirstInsertionOffset(read) + * Last term is because if a read starts with an insertion then these bases are not accounted for in readStart. 
+ * trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to + * differentiate context between two haplotypes + */ + long startLocationInRefForHaplotypes = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0); + long stopLocationInRefForHaplotypes = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset; - int extraOffset = Math.abs(eventLength); + if (DEBUG) + System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); - long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0); - long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset; - - // Variables start and stop are coordinates (inclusive) where we want to get the haplotype from. int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases; // check if start of read will be before start of reference context - if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut - start = ref.getWindow().getStart(); - + if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) { + // read starts before haplotype: read will have to be cut + //numStartClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes; + startLocationInRefForHaplotypes = ref.getWindow().getStart(); + } // check also if end of read will go beyond reference context - if (stop > ref.getWindow().getStop()) - stop = ref.getWindow().getStop(); + if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) { + //numEndClippedBases += stopLocationInRefForHaplotypes - ref.getWindow().getStop(); + stopLocationInRefForHaplotypes = ref.getWindow().getStop(); + } - // if there's an insertion in the read, the read stop position 
will be less than start + read length, + // if there's an insertion in the read, the read stop position will be less than start + read legnth, // but we want to compute likelihoods in the whole region that a read might overlap - if (stop <= start + readLength) { - stop = start + readLength-1; + if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) { + stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; } // ok, we now figured out total number of clipped bases on both ends. // Figure out where we want to place the haplotype to score read against - /* - if (DEBUG) - System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", - numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength()); - */ + + if (DEBUG) + System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", + numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength()); LinkedHashMap readEl = new LinkedHashMap(); + /** + * Check if we'll end up with an empty read once all clipping is done + */ if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) { int j=0; for (Allele a: haplotypeMap.keySet()) { @@ -537,69 +347,67 @@ public class PairHMMIndelErrorModel { // initialize path metric and traceback memories for likelihood computation double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null; byte[] previousHaplotypeSeen = null; - double[] previousGOP = null; - double[] previousGCP = null; - int startIdx; + final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; + final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; + + // get homopolymer 
length profile for current haplotype + int[] hrunProfile = new int[readBases.length]; + getContextHomopolymerLength(readBases,hrunProfile); + fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); + + for (Allele a: haplotypeMap.keySet()) { - Haplotype haplotype = haplotypeMap.get(a); - if (stop > haplotype.getStopPosition()) - stop = haplotype.getStopPosition(); - if (start < haplotype.getStartPosition()) - start = haplotype.getStartPosition(); + if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) + stopLocationInRefForHaplotypes = haplotype.getStopPosition(); - // cut haplotype bases - long indStart = start - haplotype.getStartPosition(); - long indStop = stop - haplotype.getStartPosition(); + if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) + startLocationInRefForHaplotypes = haplotype.getStartPosition(); + + final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); + final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); double readLikelihood; if (DEBUG) System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", - indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength(), read.getCigar().toString()); + indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); + + - if (indStart < 0 || indStop >= haplotype.getBases().length || indStart > indStop) { - // read spanned more than allowed reference context: we currently can't deal with this - readLikelihood =0; - } else - { final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - if (matchMetricArray == null) { - final int X_METRIC_LENGTH = readBases.length+1; - final int Y_METRIC_LENGTH = 
haplotypeBases.length+1; + final int X_METRIC_LENGTH = readBases.length+2; + final int Y_METRIC_LENGTH = haplotypeBases.length+2; + if (matchMetricArray == null) { + //no need to reallocate arrays for each new haplotype, as length won't change matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + + PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); } - final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); - final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); - if (previousHaplotypeSeen == null) - startIdx = 0; - else { - final int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); - final int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP); - final int s3 = computeFirstDifferingPosition(currentContextGCP, previousGCP); - startIdx = Math.min(Math.min(s1, s2), s3); - } + + int startIndexInHaplotype = 0; + if (previousHaplotypeSeen != null) + startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); previousHaplotypeSeen = haplotypeBases.clone(); - previousGOP = currentContextGOP.clone(); - previousGCP = currentContextGCP.clone(); + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, + contextLogGapOpenProbabilities, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities, + startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); - readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, - currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray); if (DEBUG) { System.out.println("H:"+new String(haplotypeBases)); System.out.println("R:"+new 
String(readBases)); System.out.format("L:%4.2f\n",readLikelihood); - System.out.format("StPos:%d\n", startIdx); + System.out.format("StPos:%d\n", startIndexInHaplotype); } - } readEl.put(a,readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index 072962436..7a3b85567 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -20,10 +20,8 @@ import java.util.*; public class AlleleCount extends VariantStratifier { @Override public void initialize() { - List> evals = getVariantEvalWalker().getEvals(); - // we can only work with a single eval VCF, and it must have genotypes - if ( evals.size() != 1 ) + if ( getVariantEvalWalker().getEvals().size() != 1 && !getVariantEvalWalker().mergeEvals ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification only works with a single eval vcf"); // There are 2 x n sample chromosomes for diploids diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index e2d1692d0..3778cffb8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -241,14 +241,6 @@ public class VariantDataManager { value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } - if (vc.isIndel() && annotationKey.equalsIgnoreCase("QD")) { - // normalize QD by event length for indel case - int eventLength = 
Math.abs(vc.getAlternateAllele(0).getBaseString().length() - vc.getReference().getBaseString().length()); // ignore multi-allelic complication here for now - if (eventLength > 0) { // sanity check - value /= (double)eventLength; - } - } - if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } } catch( Exception e ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java new file mode 100644 index 000000000..9fcb97a4d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.*; + +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. + * User: rpoplin + * Date: 3/1/12 + */ + +public class PairHMM { + private static final int MAX_CACHED_QUAL = (int)Byte.MAX_VALUE; + private static final byte DEFAULT_GOP = (byte) 45; + private static final byte DEFAULT_GCP = (byte) 10; + private static final double BANDING_TOLERANCE = 22.0; + private static final int BANDING_CLUSTER_WINDOW = 12; + private final boolean noBanded; + + public PairHMM() { + noBanded = false; + } + + public PairHMM( final boolean noBanded ) { + this.noBanded = noBanded; + } + + + public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, + final int X_METRIC_LENGTH) { + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); + } + + // the initial condition + matchMetricArray[1][1] = 0.0; // Math.log10(1.0); + + } + + @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability + public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, + final byte[] insertionGOP, 
final byte[] deletionGOP, final byte[] overallGCP ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // initial arrays to hold the probabilities of being in the match, insertion and deletion cases + final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + + return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray); + } + + @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability + public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, + final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( 
readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); + } + + if( false ) { + final ArrayList workQueue = new ArrayList(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step + final ArrayList workToBeAdded = new ArrayList(); + final ArrayList calculatedValues = new ArrayList(); + final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1; + workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype + + for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over + //Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order + int el = 1; + for( int work : workQueue ) { + // choose the appropriate diagonal baseline location + int iii = 0; + int jjj = diag; + if( diag > Y_METRIC_LENGTH ) { + iii = diag - Y_METRIC_LENGTH; + jjj = Y_METRIC_LENGTH; + } + // move to the starting work location along the diagonal + iii += work; + jjj -= work; + while( iii >= X_METRIC_LENGTH || jjj <= 0 ) { + iii--; + jjj++; + work--; + } + if( !detectClusteredStartLocations(workToBeAdded, work ) ) { + workToBeAdded.add(work); // keep this thread going once it has started + } + + if( work >= el - 3 ) { + // step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value + double maxElement = Double.NEGATIVE_INFINITY; + for( el = work; el < numDiags + 1; el++ ) { + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, + insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); + final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); 
+ calculatedValues.add(bestMetric); + if( bestMetric > maxElement ) { + maxElement = bestMetric; + } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { + break; + } + if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix + break; + } + if( --jjj <= 0 ) { // don't walk off the edge of the matrix + break; + } + } + + // find a local maximum to start a new band in the work queue + double localMaxElement = Double.NEGATIVE_INFINITY; + int localMaxElementIndex = 0; + for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) { + final double bestMetric = calculatedValues.get(kkk); + if( bestMetric > localMaxElement ) { + localMaxElement = bestMetric; + localMaxElementIndex = kkk; + } else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum + if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) { + workToBeAdded.add( work + localMaxElementIndex ); + } + break; + } + } + calculatedValues.clear(); + + // reset iii and jjj to the appropriate diagonal baseline location + iii = 0; + jjj = diag; + if( diag > Y_METRIC_LENGTH ) { + iii = diag - Y_METRIC_LENGTH; + jjj = Y_METRIC_LENGTH; + } + // move to the starting work location along the diagonal + iii += work-1; + jjj -= work-1; + + // step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value + for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) { + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, + insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); + final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); + if( bestMetric > maxElement ) { + maxElement = bestMetric; + } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { + break; + } + } + } + } + workQueue.clear(); + 
workQueue.addAll(workToBeAdded); + workToBeAdded.clear(); + } + } else { + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); + } + + // private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other + private boolean detectClusteredStartLocations( final ArrayList list, int loc ) { + for(int x : list) { + if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) { + return true; + } + } + return false; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index b5aa2598e..f53b439da 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -9,6 +9,7 @@ import net.sf.samtools.SAMUtils; * @author Kiran Garimella */ public class QualityUtils { + public final static byte MAX_RECALIBRATED_Q_SCORE = 50; public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 70eb9426b..d85fb03cd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -68,9 +68,9 @@ public class BaseRecalibration { /** * This constructor only exists for testing purposes. 
* - * @param quantizationInfo - * @param keysAndTablesMap - * @param requestedCovariates + * @param quantizationInfo the quantization info object + * @param keysAndTablesMap the map of key managers and recalibration tables + * @param requestedCovariates the list of requested covariates */ protected BaseRecalibration(QuantizationInfo quantizationInfo, LinkedHashMap> keysAndTablesMap, ArrayList requestedCovariates) { this.quantizationInfo = quantizationInfo; @@ -179,9 +179,8 @@ public class BaseRecalibration { } double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula - recalibratedQual = QualityUtils.boundQual((int) Math.round(recalibratedQual), QualityUtils.MAX_QUAL_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL + recalibratedQual = QualityUtils.boundQual((int) Math.round(recalibratedQual), QualityUtils.MAX_RECALIBRATED_Q_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL - return quantizationInfo.getQuantizedQuals().get((int) recalibratedQual); // return the quantized version of the recalibrated quality } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java index 2b4cb2ae3..4b384aac0 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java @@ -39,8 +39,8 @@ public class ContextCovariateUnitTest { private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) { for (int i = 0; i < values.length; i++) { String expectedContext = null; - if (i >= contextSize) { - String context = bases.substring(i - contextSize, i); + if (i - contextSize + 1 >= 0) { + String context = bases.substring(i - contextSize + 1, i + 1); if (!context.contains("N")) 
expectedContext = context; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 4d00f6113..067e9088c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("d3191b2f10139c969501990ffdf29082")); + Arrays.asList("9b08dc6800ba11bc6d9f6ccf392a60fe")); executeTest("test MultiSample Pilot1", spec); } @@ -54,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("7c7288170c6aadae555a44e79ca5bf19")); + Arrays.asList("d275e0f75368dbff012ea8655dce3444")); executeTest("test SingleSample Pilot2", spec); } @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("c956f0ea0e5f002288a09f4bc4af1319")); + Arrays.asList("e948543b83bfd0640fcb994d72f8e234")); executeTest("test Multiple SNP alleles", 
spec); } @@ -80,7 +80,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "2158eb918abb95225ea5372fcd9c9236"; + private final static String COMPRESSED_OUTPUT_MD5 = "1e3c897794e5763a8720807686707b18"; @Test public void testCompressedOutput() { @@ -101,7 +101,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "834e85f6af4ad4a143b913dfc7defb08"; + String md5 = "06d11ed89f02f08911e100df0f7db7a4"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -200,8 +200,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "d5879f1c277035060434d79a441b31ca" ); - e.put( 1.0 / 1850, "13f80245bab2321b92d27eebd5c2fc33" ); + e.put( 0.01, "d07e5ca757fbcb1c03f652f82265c2f8" ); + e.put( 1.0 / 1850, "d1fb9186e6f39f2bcf5d0edacd8f7fe2" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -225,7 +225,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("8c134a6e0abcc70d2ed3216d5f8e0100")); + Arrays.asList("623be1fd8b63a01bfe35ac864d5199fe")); executeTest(String.format("test multiple technologies"), spec); } @@ -244,7 +244,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("34baad3177712f6cd0b476f4c578e08f")); + Arrays.asList("40ea10c0238c3be2991d31ae72476884")); 
executeTest(String.format("test calling with BAQ"), spec); } @@ -263,7 +263,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("4bf4f819a39a73707cae60fe30478742")); + Arrays.asList("c9b0bd900a4ec949adfbd28909581eeb")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -278,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("ae08fbd6b0618cf3ac1be763ed7b41ca")); + Arrays.asList("6b7c8691c527facf9884c2517d943f2f")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -291,7 +291,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("120600f2bfa3a47bd93b50f768f98d5b")); + Arrays.asList("d72603aa33a086d64d4dddfd2995552f")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -301,7 +301,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("2e75d2766235eab23091a67ea2947d13")); + Arrays.asList("4a59fe207949b7d043481d7c1b786573")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -311,7 +311,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("5057bd7d07111e8b1085064782eb6c80")); + 
Arrays.asList("a8a9ccf30bddee94bb1d300600794ee7")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -319,13 +319,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("c0f9ca3ceab90ebd38cc0eec9441d71f")); + Arrays.asList("0b388936022539530f565da14d5496d3")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("0240f34e71f137518be233c9890a5349")); + Arrays.asList("537dd9b4174dc356fb13d8d3098ad602")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -368,7 +368,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("53758e66e3a3188bd9c78d2329d41962")); + Arrays.asList("973178b97efd2daacc9e45c414275d59")); executeTest("test minIndelFraction 0.0", spec); } @@ -376,7 +376,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("3aa39b1f6f3b1eb051765f9c21f6f461")); + Arrays.asList("220facd2eb0923515d1d8ab874055564")); executeTest("test minIndelFraction 0.25", spec); } diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 1ab7b679e..71c014f2c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -34,6 +34,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval"; private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf"; private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf"; + private static String fundamentalTestSNPsSplit1of2VCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.split_1_of_2.vcf"; + private static String fundamentalTestSNPsSplit2of2VCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.split_2_of_2.vcf"; private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.NA12045.vcf"; private static String cmdRoot = "-T VariantEval" + @@ -437,24 +439,69 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testAlleleCountStrat() { WalkerTestSpec spec = new WalkerTestSpec( - buildCommandLine( - "-T VariantEval", - "-R " + b37KGReference, - "--dbsnp " + b37dbSNP132, - "--eval " + fundamentalTestSNPsVCF, - "-noEV", - "-EV CountVariants", - "-noST", - "-ST AlleleCount", - "-L " + fundamentalTestSNPsVCF, - "-o %s" - ), - 1, - Arrays.asList("1198bfea6183bd43219071a84c79a386") - ); + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestSNPsVCF, + "-noEV", 
+ "-EV CountVariants", + "-noST", + "-ST AlleleCount", + "-L " + fundamentalTestSNPsVCF, + "-o %s" + ), + 1, + Arrays.asList("1198bfea6183bd43219071a84c79a386") + ); executeTest("testAlleleCountStrat", spec); } + @Test + public void testMultipleEvalTracksAlleleCountWithMerge() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestSNPsSplit1of2VCF, + "--eval " + fundamentalTestSNPsSplit2of2VCF, + "--mergeEvals", + "-noEV", + "-EV CountVariants", + "-noST", + "-ST AlleleCount", + "-L " + fundamentalTestSNPsVCF, + "-o %s" + ), + 1, + Arrays.asList("1198bfea6183bd43219071a84c79a386") + ); + executeTest("testMultipleEvalTracksAlleleCountWithMerge", spec); + } + + @Test + public void testMultipleEvalTracksAlleleCountWithoutMerge() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestSNPsSplit1of2VCF, + "--eval " + fundamentalTestSNPsSplit2of2VCF, + //"--mergeEvals", No merge with AC strat ==> error + "-noEV", + "-EV CountVariants", + "-noST", + "-ST AlleleCount", + "-L " + fundamentalTestSNPsVCF + ), + 0, + UserException.class + ); + executeTest("testMultipleEvalTracksAlleleCountWithoutMerge", spec); + } + @Test public void testIntervalStrat() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 11e093a6c..879a5bfa3 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -73,9 
+73,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { } VRTest indel = new VRTest("combined.phase1.chr20.raw.indels.sites.vcf", - "6d7ee4cb651c8b666e4a4523363caaff", // tranches - "ee5b408c8434a594496118875690c438", // recal file - "5d7e07d8813db96ba3f3dfe4737f83d1"); // cut VCF + "da4458d05f6396f5c4ab96f274e5ccdc", // tranches + "cf380d9b0ae04c8918be8425f82035b4", // recal file + "b00e5e5a6807df8ed1682317948e8a6d"); // cut VCF @DataProvider(name = "VRIndelTest") public Object[][] createData2() { diff --git a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java new file mode 100644 index 000000000..22bcb1bbf --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +// our package +package org.broadinstitute.sting.utils; + + +// the imports for unit testing. + + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class PairHMMUnitTest extends BaseTest { + final static boolean EXTENSIVE_TESTING = true; + PairHMM hmm = new PairHMM( false ); // reference implementation + PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding + + // -------------------------------------------------------------------------------- + // + // Provider + // + // -------------------------------------------------------------------------------- + + private class BasicLikelihoodTestProvider extends TestDataProvider { + final String ref, read; + final byte[] refBasesWithContext, readBasesWithContext; + final int baseQual, insQual, delQual, gcp; + final int expectedQual; + final static String CONTEXT = "ACGTAATGACGATTGCA"; + final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; + final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; + + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); + } + + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { + super(BasicLikelihoodTestProvider.class, String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); + this.baseQual = baseQual; + this.delQual = delQual; + this.insQual = insQual; + this.gcp = gcp; + this.read = read; + this.ref = ref; + this.expectedQual = expectedQual; + + 
refBasesWithContext = asBytes(ref, left, right); + readBasesWithContext = asBytes(read, false, false); + } + + public double expectedLogL() { + return expectedQual / -10.0; + } + + public double tolerance() { + return 0.1; // TODO FIXME arbitrary + } + + public double calcLogL() { + + double logL = hmm.computeReadLikelihoodGivenHaplotype( + refBasesWithContext, readBasesWithContext, + qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true), + qualAsBytes(gcp, false)); + + return logL; + } + + private final byte[] asBytes(final String bases, final boolean left, final boolean right) { + return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); + } + + private byte[] qualAsBytes(final int phredQual, final boolean doGOP) { + final byte phredQuals[] = new byte[readBasesWithContext.length]; + // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM + Arrays.fill(phredQuals, (byte)100); + + // update just the bases corresponding to the provided micro read with the quality scores + if( doGOP ) { + phredQuals[0 + CONTEXT.length()] = (byte)phredQual; + } else { + for ( int i = 0; i < read.length(); i++) + phredQuals[i + CONTEXT.length()] = (byte)phredQual; + } + + return phredQuals; + } + } + + final Random random = new Random(87865573); + private class BandedLikelihoodTestProvider extends TestDataProvider { + final String ref, read; + final byte[] refBasesWithContext, readBasesWithContext; + final int baseQual, insQual, delQual, gcp; + final int expectedQual; + final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC"; + final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA"; + final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT"; + final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA"; + final byte[] baseQuals, insQuals, delQuals, gcps; + + public BandedLikelihoodTestProvider(final String ref, final String read, final int 
baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); + } + + public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { + super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); + this.baseQual = baseQual; + this.delQual = delQual; + this.insQual = insQual; + this.gcp = gcp; + this.read = read; + this.ref = ref; + this.expectedQual = expectedQual; + + refBasesWithContext = asBytes(ref, left, right); + readBasesWithContext = asBytes(read, false, false); + baseQuals = qualAsBytes(baseQual); + insQuals = qualAsBytes(insQual); + delQuals = qualAsBytes(delQual); + gcps = qualAsBytes(gcp, false); + } + + public double expectedLogL() { + double logL = hmm.computeReadLikelihoodGivenHaplotype( + refBasesWithContext, readBasesWithContext, + baseQuals, insQuals, delQuals, gcps); + + return logL; + } + + public double tolerance() { + return 0.2; // TODO FIXME arbitrary + } + + public double calcLogL() { + + double logL = bandedHMM.computeReadLikelihoodGivenHaplotype( + refBasesWithContext, readBasesWithContext, + baseQuals, insQuals, delQuals, gcps); + + return logL; + } + + private final byte[] asBytes(final String bases, final boolean left, final boolean right) { + return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? 
RIGHT_FLANK : "")).getBytes(); + } + + private byte[] qualAsBytes(final int phredQual) { + return qualAsBytes(phredQual, true); + } + + private byte[] qualAsBytes(final int phredQual, final boolean addRandom) { + final byte phredQuals[] = new byte[readBasesWithContext.length]; + Arrays.fill(phredQuals, (byte)phredQual); + if(addRandom) { + for( int iii = 0; iii < phredQuals.length; iii++) { + phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3)); + } + } + return phredQuals; + } + } + + @DataProvider(name = "BasicLikelihoodTestProvider") + public Object[][] makeBasicLikelihoodTests() { + // context on either side is ACGTTGCA REF ACGTTGCA + // test all combinations + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + + for ( final int baseQual : baseQuals ) { + for ( final int indelQual : indelQuals ) { + for ( final int gcp : gcps ) { + + // test substitutions + for ( final byte refBase : BaseUtils.BASES ) { + for ( final byte readBase : BaseUtils.BASES ) { + final String ref = new String(new byte[]{refBase}); + final String read = new String(new byte[]{readBase}); + final int expected = refBase == readBase ? 0 : baseQual; + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + } + } + + // test insertions and deletions + for ( final int size : sizes ) { + for ( final byte base : BaseUtils.BASES ) { + final int expected = indelQual + (size - 2) * gcp; + + for ( boolean insertionP : Arrays.asList(true, false)) { + final String small = Utils.dupString((char)base, 1); + final String big = Utils.dupString((char)base, size); + + final String ref = insertionP ? 
small : big; + final String read = insertionP ? big : small; + + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + } + } + } + } + } + } + + return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); + } + + @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) + public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { + double calculatedLogL = cfg.calcLogL(); + double expectedLogL = cfg.expectedLogL(); + logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + } + + @DataProvider(name = "BandedLikelihoodTestProvider") + public Object[][] makeBandedLikelihoodTests() { + // context on either side is ACGTTGCA REF ACGTTGCA + // test all combinations + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + + for ( final int baseQual : baseQuals ) { + for ( final int indelQual : indelQuals ) { + for ( final int gcp : gcps ) { + + // test substitutions + for ( final byte refBase : BaseUtils.BASES ) { + for ( final byte readBase : BaseUtils.BASES ) { + final String ref = new String(new byte[]{refBase}); + final String read = new String(new byte[]{readBase}); + final int expected = refBase == readBase ? 
0 : baseQual; + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + } + } + + // test insertions and deletions + for ( final int size : sizes ) { + for ( final byte base : BaseUtils.BASES ) { + final int expected = indelQual + (size - 2) * gcp; + + for ( boolean insertionP : Arrays.asList(true, false)) { + final String small = Utils.dupString((char)base, 1); + final String big = Utils.dupString((char)base, size); + + final String ref = insertionP ? small : big; + final String read = insertionP ? big : small; + + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + } + } + } + } + } + } + + return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class); + } + + @Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true) + public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) { + double calculatedLogL = cfg.calcLogL(); + double expectedLogL = cfg.expectedLogL(); + logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + } + + @Test + public void testMismatchInEveryPositionInTheReadWithCenteredHaplotype() { + byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); + + final int offset = 2; + byte[] gop = new byte[haplotype1.length - 2 * offset]; + Arrays.fill(gop, (byte) 80); + byte[] gcp = new byte[haplotype1.length - 2 * offset]; + Arrays.fill(gcp, (byte) 80); + + for( int k = 0; k < haplotype1.length - 2 * offset; k++ ) { + byte[] quals = new byte[haplotype1.length - 2 * offset]; + 
Arrays.fill(quals, (byte) 90); + // one read mismatches the haplotype + quals[k] = 20; + + byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); + // change single base at position k to C. If it's a C, change to T + mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); + double res1 = hmm.computeReadLikelihoodGivenHaplotype( + haplotype1, mread, + quals, gop, gop, + gcp); + + + System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); + + Assert.assertEquals(res1, -2.0, 1e-2); + } + } + + @Test + public void testMismatchInEveryPositionInTheRead() { + byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); + + final int offset = 2; + byte[] gop = new byte[haplotype1.length - offset]; + Arrays.fill(gop, (byte) 80); + byte[] gcp = new byte[haplotype1.length - offset]; + Arrays.fill(gcp, (byte) 80); + + for( int k = 0; k < haplotype1.length - offset; k++ ) { + byte[] quals = new byte[haplotype1.length - offset]; + Arrays.fill(quals, (byte) 90); + // one read mismatches the haplotype + quals[k] = 20; + + byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); + // change single base at position k to C. If it's a C, change to T + mread[k] = ( mread[k] == (byte)'C' ? 
(byte)'T' : (byte)'C'); + double res1 = hmm.computeReadLikelihoodGivenHaplotype( + haplotype1, mread, + quals, gop, gop, + gcp); + + + System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); + + Assert.assertEquals(res1, -2.0, 1e-2); + } + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java index 4f0d39991..1193b0aea 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java @@ -24,10 +24,6 @@ public class BaseRecalibrationUnitTest { private org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager dataManager; private LinkedHashMap> keysAndTablesMap; - private BQSRKeyManager rgKeyManager; - private BQSRKeyManager qsKeyManager; - private BQSRKeyManager cvKeyManager; - private ReadGroupCovariate rgCovariate; private QualityScoreCovariate qsCovariate; private ContextCovariate cxCovariate; @@ -60,13 +56,13 @@ public class BaseRecalibrationUnitTest { rgCovariate = new ReadGroupCovariate(); rgCovariate.initialize(RAC); requiredCovariates.add(rgCovariate); - rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + BQSRKeyManager rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); keysAndTablesMap.put(rgKeyManager, new HashMap()); qsCovariate = new QualityScoreCovariate(); qsCovariate.initialize(RAC); requiredCovariates.add(qsCovariate); - qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + BQSRKeyManager qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); keysAndTablesMap.put(qsKeyManager, new HashMap()); cxCovariate = new ContextCovariate(); @@ -75,7 +71,7 @@ public class BaseRecalibrationUnitTest 
{ cyCovariate = new CycleCovariate(); cyCovariate.initialize(RAC); optionalCovariates.add(cyCovariate); - cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + BQSRKeyManager cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); keysAndTablesMap.put(cvKeyManager, new HashMap()); @@ -108,7 +104,7 @@ public class BaseRecalibrationUnitTest { updateCovariateWithKeySet(mapEntry.getValue(), key, newDatum); } } - dataManager.generateEmpiricalQualities(1, QualityUtils.MAX_QUAL_SCORE); + dataManager.generateEmpiricalQualities(1, QualityUtils.MAX_RECALIBRATED_Q_SCORE); List quantizedQuals = new ArrayList(); List qualCounts = new ArrayList(); @@ -179,7 +175,7 @@ public class BaseRecalibrationUnitTest { BitSet key = entry.getKey(); RecalDatum datum = entry.getValue(); List keySet = keyManager.keySetFrom(key); - System.out.println(String.format("%s => %s", Utils.join(",", keySet), datum)); + System.out.println(String.format("%s => %s", Utils.join(",", keySet), datum) + "," + datum.getEstimatedQReported()); } System.out.println(); } @@ -187,9 +183,9 @@ public class BaseRecalibrationUnitTest { } - private static void printNestedHashMap(Map table, String output) { + private static void printNestedHashMap(Map table, String output) { for (Object key : table.keySet()) { - String ret = ""; + String ret; if (output.isEmpty()) ret = "" + key; else @@ -199,7 +195,7 @@ public class BaseRecalibrationUnitTest { if (next instanceof org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) System.out.println(ret + " => " + next); else - printNestedHashMap((Map) next, "" + ret); + printNestedHashMap((Map) next, "" + ret); } } @@ -269,7 +265,7 @@ public class BaseRecalibrationUnitTest { } final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; - return QualityUtils.boundQual((int) Math.round(newQuality), QualityUtils.MAX_QUAL_SCORE); + return QualityUtils.boundQual((int) Math.round(newQuality), 
QualityUtils.MAX_RECALIBRATED_Q_SCORE); // Verbose printouts used to validate with old recalibrator //if(key.contains(null)) { @@ -289,6 +285,6 @@ public class BaseRecalibrationUnitTest { final double doubleMismatches = (double) (errors + smoothing); final double doubleObservations = (double) ( observations + smoothing ); double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - return Math.min(QualityUtils.MAX_QUAL_SCORE, empiricalQual); + return Math.min(QualityUtils.MAX_RECALIBRATED_Q_SCORE, empiricalQual); } }