From f9f8589692fece0185a7e8e059b75ee4672d1c8d Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 11 Apr 2012 13:56:51 -0400 Subject: [PATCH 01/17] Refactoring/fixing up UG HMM code: a) Make code use PairHMM class instead of having duplicated code. That way UG and HaplotypeCaller now use same core code. Changes to be able to do this: 1. Compute context-dependent GOP as a function of read, not of haplotype, b) Extracted code to initialize HMM arrays into separate method, c) Move PairHMM class and unit test to public, d) Reenable banded code in PairHMM, inverted sense of flag (true=enable feature) but leave off in HaplotypeCaller. --- .../genotyper/UnifiedArgumentCollection.java | 4 +- .../indels/PairHMMIndelErrorModel.java | 250 +++++++------- .../broadinstitute/sting/utils/PairHMM.java | 255 +++++++++++++++ .../sting/utils/PairHMMUnitTest.java | 305 ++++++++++++++++++ 4 files changed, 695 insertions(+), 119 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/PairHMM.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 9f606cdfb..93f5c0a43 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -137,11 +137,11 @@ public class UnifiedArgumentCollection { @Hidden @Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty", required = false) - public double INDEL_GAP_CONTINUATION_PENALTY = 10.0; + public byte INDEL_GAP_CONTINUATION_PENALTY = 10; @Hidden @Argument(fullName = "indelGapOpenPenalty", shortName = "indelGOP", doc = "Indel gap open penalty", required = false) - public 
double INDEL_GAP_OPEN_PENALTY = 45.0; + public byte INDEL_GAP_OPEN_PENALTY = 45; @Hidden @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 890ed9e3d..171c42040 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -31,7 +31,9 @@ import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.PairHMM; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -41,13 +43,14 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.Map; public class PairHMMIndelErrorModel { public static final int BASE_QUAL_THRESHOLD = 20; private boolean DEBUG = false; - private boolean bandedLikelihoods = false; + private boolean bandedLikelihoods = true; private static final int MAX_CACHED_QUAL = 127; @@ -60,12 +63,12 @@ public class PairHMMIndelErrorModel { private static final int START_HRUN_GAP_IDX = 4; private static final int MAX_HRUN_GAP_IDX = 20; - private static final double MIN_GAP_OPEN_PENALTY = 30.0; - private static final double MIN_GAP_CONT_PENALTY = 10.0; - private static final double GAP_PENALTY_HRUN_STEP = 1.0; // each increase in 
hrun decreases gap penalty by this. + private static final byte MIN_GAP_OPEN_PENALTY = 30; + private static final byte MIN_GAP_CONT_PENALTY = 10; + private static final byte GAP_PENALTY_HRUN_STEP = 1; // each increase in hrun decreases gap penalty by this. - private final double[] GAP_OPEN_PROB_TABLE; - private final double[] GAP_CONT_PROB_TABLE; + private final byte[] GAP_OPEN_PROB_TABLE; + private final byte[] GAP_CONT_PROB_TABLE; ///////////////////////////// // Private Member Variables @@ -86,42 +89,42 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean bandedLikelihoods) { + public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) { this.DEBUG = deb; - this.bandedLikelihoods = bandedLikelihoods; + //this.bandedLikelihoods = bandedLikelihoods; // fill gap penalty table, affine naive model: - this.GAP_CONT_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; - this.GAP_OPEN_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; + this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; + this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - double gop = -indelGOP/10.0; - double gcp = -indelGCP/10.0; for (int i = 0; i < START_HRUN_GAP_IDX; i++) { - GAP_OPEN_PROB_TABLE[i] = gop; - GAP_CONT_PROB_TABLE[i] = gcp; + GAP_OPEN_PROB_TABLE[i] = indelGOP; + GAP_CONT_PROB_TABLE[i] = indelGCP; } double step = GAP_PENALTY_HRUN_STEP/10.0; - double maxGOP = -MIN_GAP_OPEN_PENALTY/10.0; // phred to log prob - double maxGCP = -MIN_GAP_CONT_PENALTY/10.0; // phred to log prob + // initialize gop and gcp to their default values + byte gop = indelGOP; + byte gcp = indelGCP; + // all of the following is computed in Qual-space for (int i=START_HRUN_GAP_IDX; i < MAX_HRUN_GAP_IDX; i++) { - gop += step; - if (gop > maxGOP) - gop = maxGOP; + gop -= GAP_PENALTY_HRUN_STEP; + if (gop < MIN_GAP_OPEN_PENALTY) + gop = MIN_GAP_OPEN_PENALTY; - gcp += step; - if(gcp > maxGCP) - gcp = maxGCP; + gcp 
-= step; + if(gcp < MIN_GAP_CONT_PENALTY) + gcp = MIN_GAP_CONT_PENALTY; GAP_OPEN_PROB_TABLE[i] = gop; GAP_CONT_PROB_TABLE[i] = gcp; } } - static private void getContextHomopolymerLength(final byte[] refBytes, int[] hrunArray) { + static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { // compute forward hrun length, example: // AGGTGACCCCCCTGAGAG // 001000012345000000 @@ -155,7 +158,7 @@ public class PairHMMIndelErrorModel { private void updateCell(final int indI, final int indJ, final int X_METRIC_LENGTH, final int Y_METRIC_LENGTH, byte[] readBases, byte[] readQuals, byte[] haplotypeBases, - double[] currentGOP, double[] currentGCP, double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { + byte[] currentGOP, byte[] currentGCP, double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { if (indI > 0 && indJ > 0) { final int im1 = indI -1; final int jm1 = indJ - 1; @@ -168,20 +171,20 @@ public class PairHMMIndelErrorModel { matchMetricArray[indI][indJ] = pBaseRead + MathUtils.approximateLog10SumLog10(new double[]{matchMetricArray[im1][jm1], XMetricArray[im1][jm1], YMetricArray[im1][jm1]}); - final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; - final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; + final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : -(double)currentGOP[im1]/10.0; + final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : -(double)currentGCP[im1]/10.0; XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1); // update Y array - final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; - final double d2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; + final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : -(double)currentGOP[im1]/10.0; + final double d2 = indI == X_METRIC_LENGTH-1 ? 
END_GAP_COST : -(double)currentGCP[im1]/10.0; YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2); } } private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, - double[] currentGOP, double[] currentGCP, int indToStart, + byte[] currentGOP, byte[] currentGCP, int indToStart, double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { final int X_METRIC_LENGTH = readBases.length+1; @@ -349,8 +352,9 @@ public class PairHMMIndelErrorModel { } - private void fillGapProbabilities(int[] hrunProfile, - double[] contextLogGapOpenProbabilities, double[] contextLogGapContinuationProbabilities) { + private void fillGapProbabilities(final int[] hrunProfile, + final byte[] contextLogGapOpenProbabilities, + final byte[] contextLogGapContinuationProbabilities) { // fill based on lookup table for (int i = 0; i < hrunProfile.length; i++) { if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) { @@ -372,27 +376,8 @@ public class PairHMMIndelErrorModel { final int readCounts[] = new int[pileup.getNumberOfElements()]; int readIdx=0; - LinkedHashMap gapOpenProbabilityMap = new LinkedHashMap(); - LinkedHashMap gapContProbabilityMap = new LinkedHashMap(); - - // will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes. 
- // todo -- refactor into separate function - for (Allele a: haplotypeMap.keySet()) { - Haplotype haplotype = haplotypeMap.get(a); - byte[] haplotypeBases = haplotype.getBases(); - double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length]; - double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length]; - - // get homopolymer length profile for current haplotype - int[] hrunProfile = new int[haplotypeBases.length]; - getContextHomopolymerLength(haplotypeBases,hrunProfile); - fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); - - gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities); - gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities); - - } + PairHMM pairHMM = new PairHMM(bandedLikelihoods); for (PileupElement p: pileup) { // > 1 when the read is a consensus read representing multiple independent observations readCounts[readIdx] = p.getRepresentativeCount(); @@ -408,12 +393,27 @@ public class PairHMMIndelErrorModel { else { // System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName()); GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + if (read.isEmpty()) continue; - if(ReadUtils.is454Read(read)) { + if (read.getUnclippedEnd() > ref.getWindow().getStop()) + read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop()); + + if (read.isEmpty()) continue; - } + + if (read.getUnclippedStart() < ref.getWindow().getStart()) + read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart()); + + if (read.isEmpty()) + continue; + // hard-clip low quality ends - this may introduce extra H elements in CIGAR string + read = ReadClipper.hardClipLowQualEnds(read,(byte)BASE_QUAL_THRESHOLD ); + + if (read.isEmpty()) + continue; + // get bases of candidate haplotypes that overlap with reads final int trailingBases = 3; @@ -469,54 +469,56 @@ public 
class PairHMMIndelErrorModel { unclippedReadBases = read.getReadBases(); unclippedReadQuals = read.getBaseQualities(); - // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, - // and may leave a string of Q2 bases still hanging off the reads. - for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) { - if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) - numStartClippedBases++; - else - break; + final int extraOffset = Math.abs(eventLength); - } - for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){ - if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) - numEndClippedBases++; - else - break; - } + /** + * Compute genomic locations that candidate haplotypes will span. + * Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord, + * adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above. + * We will propose haplotypes that overlap the read with some padding. + * True read start = readStart + numStartClippedBases - ReadUtils.getFirstInsertionOffset(read) + * Last term is because if a read starts with an insertion then these bases are not accounted for in readStart. 
+ * trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to + * differentiate context between two haplotypes + */ + long startLocationInRefForHaplotypes = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0); + long stopLocationInRefForHaplotypes = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset; - int extraOffset = Math.abs(eventLength); + if (DEBUG) + System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); - long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0); - long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset; - - // Variables start and stop are coordinates (inclusive) where we want to get the haplotype from. int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases; // check if start of read will be before start of reference context - if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut - start = ref.getWindow().getStart(); - + if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) { + // read starts before haplotype: read will have to be cut + //numStartClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes; + startLocationInRefForHaplotypes = ref.getWindow().getStart(); + } // check also if end of read will go beyond reference context - if (stop > ref.getWindow().getStop()) - stop = ref.getWindow().getStop(); + if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) { + //numEndClippedBases += stopLocationInRefForHaplotypes - ref.getWindow().getStop(); + stopLocationInRefForHaplotypes = ref.getWindow().getStop(); + } - // if there's an insertion in the read, the read stop position 
will be less than start + read length, + // if there's an insertion in the read, the read stop position will be less than start + read length, // but we want to compute likelihoods in the whole region that a read might overlap - if (stop <= start + readLength) { - stop = start + readLength-1; + if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) { + stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; } // ok, we now figured out total number of clipped bases on both ends. // Figure out where we want to place the haplotype to score read against - /* - if (DEBUG) - System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", - numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength()); - */ + + if (DEBUG) + System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", + numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength()); LinkedHashMap readEl = new LinkedHashMap(); + /** + * Check if we'll end up with an empty read once all clipping is done + */ if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) { int j=0; for (Allele a: haplotypeMap.keySet()) { @@ -537,67 +539,81 @@ public class PairHMMIndelErrorModel { // initialize path metric and traceback memories for likelihood computation double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null; byte[] previousHaplotypeSeen = null; - double[] previousGOP = null; - double[] previousGCP = null; - int startIdx; + int startIndexInHaplotype = 0; + final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; + final byte[] contextLogGapContinuationProbabilities = new 
byte[readBases.length]; + + // get homopolymer length profile for current haplotype + int[] hrunProfile = new int[readBases.length]; + getContextHomopolymerLength(readBases,hrunProfile); + fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); + + for (Allele a: haplotypeMap.keySet()) { - Haplotype haplotype = haplotypeMap.get(a); - if (stop > haplotype.getStopPosition()) - stop = haplotype.getStopPosition(); - if (start < haplotype.getStartPosition()) - start = haplotype.getStartPosition(); + if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) + stopLocationInRefForHaplotypes = haplotype.getStopPosition(); - // cut haplotype bases - long indStart = start - haplotype.getStartPosition(); - long indStop = stop - haplotype.getStartPosition(); + if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) + startLocationInRefForHaplotypes = haplotype.getStartPosition(); + + final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); + final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); double readLikelihood; if (DEBUG) System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", - indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength(), read.getCigar().toString()); + indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); + if (indStart < 0 || indStop >= haplotype.getBases().length || indStart > indStop) { // read spanned more than allowed reference context: we currently can't deal with this - readLikelihood =0; + throw new ReviewedStingException("BUG! 
bad read clipping"); +// readLikelihood =0; } else { final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - if (matchMetricArray == null) { - final int X_METRIC_LENGTH = readBases.length+1; - final int Y_METRIC_LENGTH = haplotypeBases.length+1; + final int X_METRIC_LENGTH = readBases.length+1; + final int Y_METRIC_LENGTH = haplotypeBases.length+1; + if (matchMetricArray == null) { + //no need to reallocate arrays for each new haplotype, as length won't change matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + } - final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); - final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); + + pairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + + /* if (previousHaplotypeSeen == null) - startIdx = 0; - else { - final int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); - final int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP); - final int s3 = computeFirstDifferingPosition(currentContextGCP, previousGCP); - startIdx = Math.min(Math.min(s1, s2), s3); - } + startIndexInHaplotype = 0; + else + startIndexInHaplotype = 0; //computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); + previousHaplotypeSeen = haplotypeBases.clone(); - previousGOP = currentContextGOP.clone(); - previousGCP = currentContextGCP.clone(); + */ + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, + contextLogGapOpenProbabilities, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities, + startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); + /* double r2 = 
computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, contextLogGapOpenProbabilities, + contextLogGapContinuationProbabilities, 0, matchMetricArray, XMetricArray, YMetricArray); - readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, - currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray); - - if (DEBUG) { + if (readLikelihood > 0) { + int k=0; + } + */ if (DEBUG) { System.out.println("H:"+new String(haplotypeBases)); System.out.println("R:"+new String(readBases)); System.out.format("L:%4.2f\n",readLikelihood); - System.out.format("StPos:%d\n", startIdx); + // System.out.format("Lorig:%4.2f\n",r2); + System.out.format("StPos:%d\n", startIndexInHaplotype); } } readEl.put(a,readLikelihood); diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java new file mode 100644 index 000000000..7d393274a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.*; + +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. + * User: rpoplin + * Date: 3/1/12 + */ + +public class PairHMM { + private static final int MAX_CACHED_QUAL = (int)Byte.MAX_VALUE; + private static final byte DEFAULT_GOP = (byte) 45; + private static final byte DEFAULT_GCP = (byte) 10; + private static final double BANDING_TOLERANCE = 22.0; + private static final int BANDING_CLUSTER_WINDOW = 12; + private final boolean doBanded; + + public PairHMM() { + doBanded = false; + } + + public PairHMM( final boolean doBanded ) { + this.doBanded = doBanded; + } + + + public void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, + final int X_METRIC_LENGTH) { + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); + } + + // the initial condition + matchMetricArray[1][1] = 0.0; // Math.log10(1.0); + + } + + @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability + public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, + final byte[] insertionGOP, final 
byte[] deletionGOP, final byte[] overallGCP ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions + final int X_METRIC_LENGTH = readBases.length + 1; + final int Y_METRIC_LENGTH = haplotypeBases.length + 1; + + // initial arrays to hold the probabilities of being in the match, insertion and deletion cases + final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + + return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray); + } + + @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability + public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, + final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions + final int X_METRIC_LENGTH = readBases.length + 1; + final int Y_METRIC_LENGTH = haplotypeBases.length + 1; + + if( doBanded ) { + final ArrayList workQueue = new ArrayList(); // holds a queue of starting work location (indices along the diagonal). 
Will be sorted each step + final ArrayList workToBeAdded = new ArrayList(); + final ArrayList calculatedValues = new ArrayList(); + final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1; + workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype + + for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over + //Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order + int el = 1; + for( int work : workQueue ) { + // choose the appropriate diagonal baseline location + int iii = 0; + int jjj = diag; + if( diag > Y_METRIC_LENGTH ) { + iii = diag - Y_METRIC_LENGTH; + jjj = Y_METRIC_LENGTH; + } + // move to the starting work location along the diagonal + iii += work; + jjj -= work; + while( iii >= X_METRIC_LENGTH || jjj <= 0 ) { + iii--; + jjj++; + work--; + } + if( !detectClusteredStartLocations(workToBeAdded, work ) ) { + workToBeAdded.add(work); // keep this thread going once it has started + } + + if( work >= el - 3 ) { + // step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value + double maxElement = Double.NEGATIVE_INFINITY; + for( el = work; el < numDiags + 1; el++ ) { + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, + insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); + final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); + calculatedValues.add(bestMetric); + if( bestMetric > maxElement ) { + maxElement = bestMetric; + } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { + break; + } + if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix + break; + } + if( --jjj <= 0 ) { // don't walk off 
the edge of the matrix + break; + } + } + + // find a local maximum to start a new band in the work queue + double localMaxElement = Double.NEGATIVE_INFINITY; + int localMaxElementIndex = 0; + for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) { + final double bestMetric = calculatedValues.get(kkk); + if( bestMetric > localMaxElement ) { + localMaxElement = bestMetric; + localMaxElementIndex = kkk; + } else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum + if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) { + workToBeAdded.add( work + localMaxElementIndex ); + } + break; + } + } + calculatedValues.clear(); + + // reset iii and jjj to the appropriate diagonal baseline location + iii = 0; + jjj = diag; + if( diag > Y_METRIC_LENGTH ) { + iii = diag - Y_METRIC_LENGTH; + jjj = Y_METRIC_LENGTH; + } + // move to the starting work location along the diagonal + iii += work-1; + jjj -= work-1; + + // step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value + for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) { + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, + insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); + final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); + if( bestMetric > maxElement ) { + maxElement = bestMetric; + } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { + break; + } + } + } + } + workQueue.clear(); + workQueue.addAll(workToBeAdded); + workToBeAdded.clear(); + } + } else { + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, 
haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]}); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = ( readQuals[im1-1] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[im1-1]) ); + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10( + new double[]{matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0}); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length - 1 ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length - 1 ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); + } + + // private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other + private boolean detectClusteredStartLocations( final ArrayList list, int loc ) { + for(int x : list) { + if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) { + return true; + } + } + return false; + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java new file mode 100644 index 000000000..6f76cf520 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.utils; + + +// the imports for unit testing. + + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class PairHMMUnitTest extends BaseTest { + final static boolean EXTENSIVE_TESTING = true; + PairHMM hmm = new PairHMM( false ); // reference implementation + PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding + + // -------------------------------------------------------------------------------- + // + // Provider + // + // -------------------------------------------------------------------------------- + + private class BasicLikelihoodTestProvider extends TestDataProvider { + final String ref, read; + final byte[] refBasesWithContext, readBasesWithContext; + final int baseQual, insQual, delQual, gcp; + final int expectedQual; + final static String CONTEXT = "ACGTAATGACGATTGCA"; + final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; + final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; + + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); + } + + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { + super(BasicLikelihoodTestProvider.class, String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r 
flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); + this.baseQual = baseQual; + this.delQual = delQual; + this.insQual = insQual; + this.gcp = gcp; + this.read = read; + this.ref = ref; + this.expectedQual = expectedQual; + + refBasesWithContext = asBytes(ref, left, right); + readBasesWithContext = asBytes(read, false, false); + } + + public double expectedLogL() { + return expectedQual / -10.0; + } + + public double tolerance() { + return 0.1; // TODO FIXME arbitrary + } + + public double calcLogL() { + + double logL = hmm.computeReadLikelihoodGivenHaplotype( + refBasesWithContext, readBasesWithContext, + qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true), + qualAsBytes(gcp, false)); + + return logL; + } + + private final byte[] asBytes(final String bases, final boolean left, final boolean right) { + return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); + } + + private byte[] qualAsBytes(final int phredQual, final boolean doGOP) { + final byte phredQuals[] = new byte[readBasesWithContext.length]; + // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM + Arrays.fill(phredQuals, (byte)100); + + // update just the bases corresponding to the provided micro read with the quality scores + if( doGOP ) { + phredQuals[0 + CONTEXT.length()] = (byte)phredQual; + } else { + for ( int i = 0; i < read.length(); i++) + phredQuals[i + CONTEXT.length()] = (byte)phredQual; + } + + return phredQuals; + } + } + + final Random random = new Random(87865573); + private class BandedLikelihoodTestProvider extends TestDataProvider { + final String ref, read; + final byte[] refBasesWithContext, readBasesWithContext; + final int baseQual, insQual, delQual, gcp; + final int expectedQual; + final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC"; + final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA"; + final 
static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT"; + final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA"; + final byte[] baseQuals, insQuals, delQuals, gcps; + + public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); + } + + public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { + super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); + this.baseQual = baseQual; + this.delQual = delQual; + this.insQual = insQual; + this.gcp = gcp; + this.read = read; + this.ref = ref; + this.expectedQual = expectedQual; + + refBasesWithContext = asBytes(ref, left, right); + readBasesWithContext = asBytes(read, false, false); + baseQuals = qualAsBytes(baseQual); + insQuals = qualAsBytes(insQual); + delQuals = qualAsBytes(delQual); + gcps = qualAsBytes(gcp, false); + } + + public double expectedLogL() { + double logL = hmm.computeReadLikelihoodGivenHaplotype( + refBasesWithContext, readBasesWithContext, + baseQuals, insQuals, delQuals, gcps); + + return logL; + } + + public double tolerance() { + return 0.2; // TODO FIXME arbitrary + } + + public double calcLogL() { + + double logL = bandedHMM.computeReadLikelihoodGivenHaplotype( + refBasesWithContext, readBasesWithContext, + baseQuals, insQuals, delQuals, gcps); + + return logL; + } + + private final byte[] asBytes(final String bases, final boolean left, final boolean right) { + return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? 
RIGHT_FLANK : "")).getBytes(); + } + + private byte[] qualAsBytes(final int phredQual) { + return qualAsBytes(phredQual, true); + } + + private byte[] qualAsBytes(final int phredQual, final boolean addRandom) { + final byte phredQuals[] = new byte[readBasesWithContext.length]; + Arrays.fill(phredQuals, (byte)phredQual); + if(addRandom) { + for( int iii = 0; iii < phredQuals.length; iii++) { + phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3)); + } + } + return phredQuals; + } + } + + @DataProvider(name = "BasicLikelihoodTestProvider") + public Object[][] makeBasicLikelihoodTests() { + // context on either side is ACGTTGCA REF ACGTTGCA + // test all combinations + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,6,7,8,9,10,20) : Arrays.asList(2); + + for ( final int baseQual : baseQuals ) { + for ( final int indelQual : indelQuals ) { + for ( final int gcp : gcps ) { + + // test substitutions + for ( final byte refBase : BaseUtils.BASES ) { + for ( final byte readBase : BaseUtils.BASES ) { + final String ref = new String(new byte[]{refBase}); + final String read = new String(new byte[]{readBase}); + final int expected = refBase == readBase ? 0 : baseQual; + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + } + } + + // test insertions and deletions + for ( final int size : sizes ) { + for ( final byte base : BaseUtils.BASES ) { + final int expected = indelQual + (size - 2) * gcp; + + for ( boolean insertionP : Arrays.asList(true, false)) { + final String small = Utils.dupString((char)base, 1); + final String big = Utils.dupString((char)base, size); + + final String ref = insertionP ? 
small : big; + final String read = insertionP ? big : small; + + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + } + } + } + } + } + } + + return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); + } + + @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) + public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { + double calculatedLogL = cfg.calcLogL(); + double expectedLogL = cfg.expectedLogL(); + logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + } + + @DataProvider(name = "BandedLikelihoodTestProvider") + public Object[][] makeBandedLikelihoodTests() { + // context on either side is ACGTTGCA REF ACGTTGCA + // test all combinations + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,6,7,8,9,10,20) : Arrays.asList(2); + + for ( final int baseQual : baseQuals ) { + for ( final int indelQual : indelQuals ) { + for ( final int gcp : gcps ) { + + // test substitutions + for ( final byte refBase : BaseUtils.BASES ) { + for ( final byte readBase : BaseUtils.BASES ) { + final String ref = new String(new byte[]{refBase}); + final String read = new String(new byte[]{readBase}); + final int expected = refBase == readBase ? 
0 : baseQual; + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + } + } + + // test insertions and deletions + for ( final int size : sizes ) { + for ( final byte base : BaseUtils.BASES ) { + final int expected = indelQual + (size - 2) * gcp; + + for ( boolean insertionP : Arrays.asList(true, false)) { + final String small = Utils.dupString((char)base, 1); + final String big = Utils.dupString((char)base, size); + + final String ref = insertionP ? small : big; + final String read = insertionP ? big : small; + + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + } + } + } + } + } + } + + return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class); + } + + @Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true) + public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) { + double calculatedLogL = cfg.calcLogL(); + double expectedLogL = cfg.expectedLogL(); + logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + } +} \ No newline at end of file From c78b0eee3a33c44617c273c216d12ffafe7fa885 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 17 Apr 2012 14:22:48 -0400 Subject: [PATCH 02/17] Refactoring/fixing up UG HMM code: a) Make code use PairHMM class instead of having duplicated code. That way UG and HaplotypeCaller now use same core code. Changes to be able to do this: 1. 
Compute context-dependent GOP as a function of read, not of haplotype, b) Extracted code to initialize HMM arrays into separate method, c) Move PairHMM class and unit test to public, d) Reenable banded code in PairHMM, inverted sense of flag (true=enable feature) but leave off in HaplotypeCaller. --- licensing/GATK1_LICENSE | 22 ++ licensing/GATK2_beta_license.doc | Bin 0 -> 43520 bytes licensing/LICENSE | 22 ++ .../utils/R/gsalib/R/gsa.variantqc.utils.R | 236 +++++++++++++++ .../executive/HierarchicalMicroScheduler.java | 82 +++--- .../sting/gatk/executive/ShardTraverser.java | 28 +- .../sting/gatk/executive/TreeReducer.java | 33 +-- .../sting/gatk/io/stubs/VCFWriterStub.java | 50 ++-- .../gatk/refdata/RefMetaDataTracker.java | 8 + .../sting/gatk/report/GATKReportTable.java | 59 ++-- .../traversals/TraverseActiveRegions.java | 3 +- .../gatk/walkers/ActiveRegionExtension.java | 1 + .../gatk/walkers/ActiveRegionWalker.java | 9 +- .../sting/gatk/walkers/FlagStatWalker.java | 24 +- .../annotator/BaseQualityRankSumTest.java | 31 +- .../walkers/annotator/ChromosomeCounts.java | 12 +- .../walkers/annotator/DepthOfCoverage.java | 21 +- .../gatk/walkers/annotator/FisherStrand.java | 57 +++- .../walkers/annotator/InbreedingCoeff.java | 12 +- .../annotator/MappingQualityRankSumTest.java | 23 +- .../gatk/walkers/annotator/QualByDepth.java | 41 ++- .../walkers/annotator/RMSMappingQuality.java | 36 ++- .../gatk/walkers/annotator/RankSumTest.java | 51 +++- .../walkers/annotator/ReadPosRankSumTest.java | 27 +- .../annotator/VariantAnnotatorEngine.java | 29 +- .../ActiveRegionBasedAnnotation.java | 18 ++ .../diagnostics/targets/DiagnoseTargets.java | 198 ++++++------- .../targets/IntervalStatistics.java | 25 +- .../genotyper/ConsensusAlleleCounter.java | 20 +- .../genotyper/UnifiedArgumentCollection.java | 17 +- .../walkers/genotyper/UnifiedGenotyper.java | 21 +- .../genotyper/UnifiedGenotyperEngine.java | 5 + .../indels/PairHMMIndelErrorModel.java | 40 ++- 
.../varianteval/VariantEvalReportWriter.java | 2 +- .../varianteval/VariantEvalWalker.java | 24 +- .../varianteval/evaluators/IndelSummary.java | 83 ++++-- .../evaluators/VariantEvaluator.java | 38 +++ .../DynamicStratification.java | 69 +++++ .../stratifications/OneBPIndel.java | 2 +- .../manager/StratificationManager.java | 129 +++++++- .../varianteval/util/EvaluationContext.java | 31 +- .../VariantDataManager.java | 4 +- .../walkers/variantutils/CombineVariants.java | 10 +- .../walkers/variantutils/SelectHeaders.java | 250 ++++++++++++++++ .../gatk/GATKExtensionsGenerator.java | 3 +- .../broadinstitute/sting/utils/PairHMM.java | 2 +- .../broadinstitute/sting/utils/R/RUtils.java | 90 ++++++ .../sting/utils/SampleUtils.java | 99 +------ .../org/broadinstitute/sting/utils/Utils.java | 14 + .../utils/activeregion/ActiveRegion.java | 7 +- .../utils/activeregion/ActivityProfile.java | 66 +++-- .../utils/codecs/vcf/AbstractVCFCodec.java | 7 +- .../sting/utils/codecs/vcf/VCFHeader.java | 73 ++++- .../pileup/AbstractReadBackedPileup.java | 36 ++- .../sting/utils/pileup/PileupElement.java | 2 - .../sting/utils/pileup/ReadBackedPileup.java | 11 + .../sting/utils/text/ListFileUtils.java | 176 ++++++++++- .../sting/utils/text/XReadLines.java | 136 +++++---- .../variantcontext/GenotypeLikelihoods.java | 6 +- .../org/broadinstitute/sting/WalkerTest.java | 16 +- .../gatk/EngineFeaturesIntegrationTest.java | 16 +- .../sting/gatk/report/GATKReportUnitTest.java | 47 ++- .../gatk/walkers/FlagStatIntegrationTest.java | 20 ++ ...ntReadsInActiveRegionsIntegrationTest.java | 2 +- .../ExactAFCalculationModelUnitTest.java | 14 + .../UnifiedGenotyperIntegrationTest.java | 31 +- .../VariantEvalIntegrationTest.java | 12 +- .../VariantEvalWalkerUnitTest.java | 277 ++++++++++++++++++ .../sting/utils/R/RUtilsUnitTest.java | 64 ++++ .../activeregion/ActivityProfileUnitTest.java | 2 +- .../utils/text/ListFileUtilsUnitTest.java | 77 ++++- .../qscripts/examples/ExampleReadFilter.scala | 47 +++ 
.../queue/extensions/gatk/GATKIntervals.scala | 1 - .../extensions/gatk/VcfGatherFunction.scala | 3 +- .../sting/queue/pipeline/PipelineTest.scala | 2 +- .../ExampleReadFilterPipelineTest.scala | 90 ++++++ 76 files changed, 2737 insertions(+), 615 deletions(-) create mode 100644 licensing/GATK1_LICENSE create mode 100644 licensing/GATK2_beta_license.doc create mode 100644 licensing/LICENSE create mode 100644 public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/R/RUtils.java create mode 100755 public/java/test/org/broadinstitute/sting/gatk/walkers/FlagStatIntegrationTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java create mode 100644 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala create mode 100644 public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala diff --git a/licensing/GATK1_LICENSE b/licensing/GATK1_LICENSE new file mode 100644 index 000000000..648ec8fc3 --- /dev/null +++ b/licensing/GATK1_LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2012 The Broad Institute + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell 
+copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/licensing/GATK2_beta_license.doc b/licensing/GATK2_beta_license.doc new file mode 100644 index 0000000000000000000000000000000000000000..4fa04a3f62b33c4f8e60f886483f6a35e99e5e29 GIT binary patch literal 43520 zcmeHw349bq7I#f95|ThT5^m^(a3_<4aLAcSG9iH^6LWAZ2uw1QWMDEAW+n*{SWylI zFBC-tTtILY5JUk*Km-v41O+i1f^sPkMdc9CbtT{bRd>&oL_l}H-{*em{QlG3)zz=6 z-h1`x)$8icp--EfS^j40v&?N9#5~#UYCq=Vl&-*WhI}5x*!?)BwA<$TS$GPc@-u}_d5g7mMkM-mHYgP&UV7o~Lfy|D{p zAj3@9({?xe2;_UOJL-fR&EWpaINyx>gNcCKMtnfk7?q1#YHaTHX@W*lJ0tOf&dU~o zZhP#Pb`Wos`6}<__KH`<_MgPbLEJ}tk?(Wg4+MQ9B=1hg{havYyvcN&)9&V2Zf9Ne zh^_^M)^`%+eK$1kCQnjl*Id-;DP>2&n{Ur*d(f`?a{8)vjNL@47w&81bn8g)f%Nx0 z_R-jr{?XPH`C_qeqU5J`e$zS0Q7-?F@=c^F(xLL*x4qD>-bfrt?Xg)HZKTZ ztYJ&WFc7c?EI`qi1>(rhs|m)H`tAQ^4kTo0R7pZ=Ms`kWPHv9cKUh#n^Aw!c9nW#xmS7#-r zs?vlkb+%fSm6#%A=4NGTvenswCL?WBpnp!0!78WcTLaNDXGuTRP`Uo86D}{1{tw_)cB^HAz-%z483R<1cqPJQFtxf17Bx;NE zECyYnUPuI4y(Pkt5+S6kgy`6qxLBd5)Zh zz52Os5j-2Js4pnc=i3apjKZjjI;~AF9+=BbMpV~eDx|jLn~O_~dYfLy?<}#HOF-D7 zFEm(f7Ol--Hbof?`FfL8uNRz3Hkb;`mSTRLl8X&CqR@4|IyqULn3FnOEhMROP#?jEE3&NJ{7(>ImW+u3HT{w3qV8 zErf0q8?0!YNTEz`LU~eW5akrXIZ>ju*bK-IpB8+H5tKTJrT` z`4dg(M%1ct;>MX`(HnUUb>{q1l09c17+Qd<6BXt~6r+6PgAGBM^2uUKO19=q!S54mGWm6R)fSpD?dLRr5 
z1ZahVqHmR2$*gf+J1*wK+!mt?qgzJ8Tovmr`36{1^k%)*l3ygChn1M2nBXwaM;Kmg0`fDtcN2LDGhABp+s+l z<)Y3Gi>nnNL&y&-pq9Z{;0_BZuPA&wN%XBUXfL<$R*?-*Yl~>HXs|U8oQAig{UVu0s~>X1R=vrdLSAavMeB@(uo!Y zg302BQHeni;t=#O6gH_t%Pk=rY&TOD0X(;1L?aP6Rh3(!@*-#n(Fc3Uute}t)m&<$ zVHL^c78-hR4Yti-wGv-=XW`b-uET243RzJfZih>7zf(!n4WK$0(9M>K7$Ec(Snmib zRg{<2P;4-2Avb+~k;!Z{7n0<`S5gG9Qk$QTQC$=>Iy$PVGeDZ9U_~AB_C`fO&P-Bp zsR)t`#UxbTW@s*&O%+A+_<(DI{Xu8bq8+42#d=$jS?8_vhgfIV~+uu8y~9@ zvcm94V?9xF?xT`S&;T;`HcKg~pP5cD(xHe7K^oyoOTZty=%CA}BBUo-H;k#~5@@}H zUl^Tv2OtvUUO=uE^!5TGZGoTzQ$KtyWe(r3h$ z$(BemKy*L62JTsr??+l{w2MNv|3aSD3Nerri90e4nLpI{BCSP7ydsw9O7mfL?c6UY zwQ$MvQMH!t?NVQhUhWJ)Iy!(<4n)FB} zi($^QK`%+C9L@%%lQK)*ki&A;RdcB)T3{Y5xKD2?gxQ3ifDX)o7X7K1oGmI?A_i_@ z;O8a_?3Z|sp{l1+NQ4`anxxKBkwX#apIV0pBbgkln`StJaAcCg(TV!Q+WXhLZH9Bz1n7NG{Pr{cNU z0&8rqvs@YKO5is5DN{gKh3|_(;6%e{fHnyI9<$3y!?;HRF{9Ifi}qq13N6|uo)3oB zVt|2lsjBFul2=t~q-hQBp6I=T< zIWVI(&h-<0E^=l`hhWN!g>qPTKK0g?X)zv)=D}hvEiB^wv`}x-Tg+Ar2P6f$k$C5H zzvKzkI#~}4w6n}%LaEi%Ec=K>{fVClCT%enwqsbLjAdHlD+CX@ksXzeahEnP4?S4E zi28Anlcs^y&O{+{$_`>w1Rp2N1?a!9k`Y3IHXp*laA&oZ(p(2gdW^;93b=aWePDt( zClyuIF)UieiK!Se;6~I*O=S-nLUUsSx-cJN(YZ*OG1!Cen0<2{gsPG-++6&~zGO$cKQf z1qEs@EQqXY;Df{FkdsE8i8EU=ZZxjKy73SITEzJT&)QCaYYle|NC=K(=a>jN^maxS zB1(YSZ7Ire>-sflW3=i8zc>T&r;OC()I@2#vXc^35B{UR*kmYxoDm1nn{=WTfF}%J zMr$cHQVdB_JW_G61~wEOw}f~{uH#)Q3`SWqrcyNA8%$_~%}}Vd!Ax-$z%$?qR%;1h zDEYj|8v*x<&%sd<&@&X8kkoa4s@MjagYc+SAhxJ`2#s+n|ywHx_fr5GjoWQ&xrnC65!ag?EA zNo$}(P8B1yf_0`5oivWydoFXM)=-T4SWBVF;61;L1M*r;I<9ULN_B-a!>37_f#$9P zIeRFA?ng0Z^az~Uo9fgc#|*~BHxvtzH67iknAh0Mr#xBoQ7a6ufTi49Or z_>5dJP^{c-xY~==hvE>!Fx|#EdE49z?M7~tz3b6s-btt!d3q*0(WMY5?3Bw40_xlw z6ATSPr}GFzeIIb27X4M22thrP>d5Ps znx2`Ks!l?fHX|`DHwpJf2qRK+QZ%_a=+^0}Vl$9tZvz!capDv-NtKYAmYOpv0@13R z45G+`%0i|pD1rZQdrUx6 zGe~ZrhQ?&4sM6A;I->fiIoT9k&y*v>saZJEWTB0kWEep>dXNr8q8X)D<4}dfv{clL zBK^bFi1iOqQFh2Y$#F58>Vs@x2NwWDi5UQ-lSaIx34!|lT%1#6KS`CY8ba}Whu+*# z_G*yO&{G2Y&1gu*(A=z15=TWZl##o<_D3=|+~ zcxpCjpqP@PLd0BTI>pL08C1qF9+)NmWeCZtM47FWOf4Zj!eff;R8bw}B#Pk<(Lm$S 
zIr}ID6*3ITpeG`=J;=a(n{2C*hIacPME1mP3b?g1W5~zI03!(-!fm{0X6?6-=_CV4 z2G%t$L!kxONwQ@k3&bG~ZiR^tm?*2vY|x>yeU(BwbR3?(DnqTwh1uwfxTAV<2~41n zNKUlb=;S|(qb_Pw3eyP3n~&HA8>~2Yp5YZ$NERLEuEBYX8AOvi=4PW{)Rv2535;kd z0=T@H(nZAb5I0AwqRx$~(ZHtXR zLBRDyF;D>4UR7+S(FV2P*4R{NWibdt42xFB-wR?J5}J&T{zj=e0nrQ3e5dL9ulMp5+A@sPpSaZ z2R;>s$3V^?CST%>Tw}PiX+R)K=-M;78+?=suvk=H3iwgPn;Kf9>ITGebmUDLE0jVm z#mwZa;E7X=LF9?#q)LRj?;!_k+?1LW&J50>PNR5AQFJo=W1m z5IRhS9=@YA2cQlTQ5!=Mm?#4M2f6H=qNh$(rgp>Arl~3gtxBY*J%aD#ZaQWU(1e}{ z4p_^YvEzo`h5im(Y*A{!P zm;oNLjRJrOnBqYiuN5X(CaRp?fM_b&3+Rzvhlqj-N_ThM($Rx*c;@ye&^*v1(NrRC z(^uIGW(@NhZrwm#j1pj*txJCp}yzP0!CaTXbB(?Qv3` zodzlHiAd}s&&6yi*JF7_5mhoBoWKx`(3*`V*SX4UrAq1Wp2UePX%T8vfJrFlJ0}P? zfY#B#?jua)85rm#{{{v&!|ZE_T4){{9EeRmi=i{`R5Ht^i9&HH;t!%H;)w3UxX7j- z-U*GFC=tUobrEVbEwE7rdaI09GiD4DC2r<%r_F2zRdi(Tw8)Q9K}CElE&EM@-XqX| z@Idxin*gtsOHcBZr4ta=T0ma&h=qVUA;M(EQC^87;(nA>PWBf^z9(`@F zKpv3QY_A@2<>UD>;@tEYk*t{{3g<;TXc=hcl@T9TvJ)?_U;w8B|6f`V z=F3=Ppa&2E!~-fo4P*ch09L>TlmgEI>wu4eQ@~fikHA&nC*T(##E-Gdl{?qISUKy( zO4Io4)a=y3+4NVV-AzM^ZnW6Kv_o66;uJq-1sKxu4f9;nS_T(wns9@K6dmeqbG_2W z`ARR}hn-19$k%pu3uCyJ&|RS!iq{qZIa^5))rpteIi%=s#rb!Y?}rJK?keBbiNo(I zAMxOh^|QBzcyPzuXbo}kPWggqoj=K)WKJ^P010dajyA+00OWKSPr}atOwo#NFTNWw*g;}ssuoKsG=+lMCbny8+UlkAO;m z^ynDy1#l9$1hj%qwFeRa(y^OW;@_!Hsy?hbRVDwPs@k}+>V>&e&3f}(XD&J&!*1*u zmTT|XyieQhMvW_WyFph5byvs+fo{WHlv+DI@e1{9=UZ5NH(TRI(s5ntz3$g*vxN3K zO7}PyXm1&b+xY_3epBtX5@qBL7~zt;quj4k!2A1wIl!~Pe1KMUzX&V>76a#i-+
k!j7M?&81Opq3GhAO zjaQPI0ynEpSMJ@ucYD>w%8iv(l{Z&cExWlLfAeoXIe)rUUp0DJ6&MYU)zAW$-s0BI zy7^+iQiIP8#oiJ<6ffJ@rExqev4%Wr@EWx%_`9`6KJqTyb>XsNt!tXSq#9R1^$De? zYi^`v`<-=7vs0+iBII_je@EhZYoI$21;hY*fqlS!-~@0P_#OztSRVqk09pdA02=?> z108@^;8%b(#oKkjBw!}69k>Pj4EzFwHe>8fpcw`r$h+#Zz1!cUzrCMTt%Qw7ilwTE z|BasBthaCGj4X(iM0r{(jXI<G)*8cExQ9@1*q5PQ9-#_;UjjGV(t)o77jC)`K4P~yGpuW!LI z`RXI_=3gF=5A4J%rXK*ifJ)#f;Ey*>g8&Us1kA&mjn4qj0-J!hffK-K-~w<7@WWe} zjer(FE1(mQ3M>K^18afn7fxL`vVZ56ooioua_tQKm5B#N92(c=tsS|RTmC0n4z9tg z8gd8!+h{hp-(-yyX}Ikc62$+vKvkq$UgZr+~`<>B$ek zRe+(`=b>*(@$vo6i;RL5nE1yr-k_*ow3HjMT0sPLo4VEwx8yKO|~Lk z_Ri=`oZ(RpcbU}Syj_>bTX!3_Ym=cpKT&eaU$@5AEqVJDcdhD9cG;Qd9dRBGbOJg9 zF+eQP6DSAX2CR{o-vji0kl)b7?b{9e5e{6g(4@`2xm@`p_qY(aXs3&Vx=&p0`O5j8 zzwU8Oo3CgG_pFH={IkdU^poMHH3&kwhc?`!P)Ewm6li#3>Q`Zav&_5TJQ7d>Q9v{x z>p&ciNe?Ciqzmr=!O#N>;H(!nu2x;WasI~f8&^Nyzi0oRZJRf4{(SXcm(%xDbM)vM zRsj}->=DDBW(W@J<1}BOJeUVKt~t#CWeY@coc^nS&#u*CySPoO&EB!#Wabym8b&KZ z<|(GP%~Y_?K);gE#30sT{h=sf{h5KG>z8?l0UhV{^$MT2%)8TiSm^bKJX!()^A33= zm5?IUZ+}e=%=HRGxJJR66IZ zZ%uVYV0wIKkFhhK`>g>@M-qo3itfB6G1d5M)yr8DzP%);hu}=l3eeh8aL$IdX5!ca z6!PB^R)n-<%3%LsbqG4kB_!aqP!V!(i&3KP!zH%}i?vuuG z4Qa`Vf{TOEt^|cMHyf}-$kWX0NWFl1Rdd#O=9!?efNPBsSUm2}QY1Ll$d6^Cb%m&l z7Q89t^&%RWNk(+ z(l~0pP>VK^1*BF`&HJ+U;K65#zMv6m=cGFg=RVkudExHnu(K254=X-Xz~0(RZ(O7F zY}7dq-|f^xDr9v$@C`~?_9NL?lzmMx(nX5SHb;OB8G(EASU22@ISOmGQpcJDqr(+r z!WF>`YrLp3@-sDYeHSFVl|~ki6xIlJqlaM8KE9k2W!(N4xQ0+|1bkcZ_ePZju4zp? 
zz+=p{`BaY82S;6W>8@4=_xJ5XX zef@Fe=)IQvKe@1~JoL*5zaQUPbg6YthoD~qlg~y+B%b^9dT>hePK+c{}>~ri~vvq1l$TErIxpq_yg!WuGlQdF-3>Gx~gTx_RZ|VS%F}mNW>Qt3RlG?42u9BVKLu+7~HpcSjh{ zeiVG6YI4hc8=q`;I(4u0{%IX6=gnHX=}Y10rXH1{KSq2x_TBIa10NqU;?j{@-Oj1M z>(RNPN8j2jANseKXNESwLjgpiLTamg3|0H`D2=&|y7~JZueM8`67^=YF6l4#%e{O& zaHCwCqvzHQi&}NL8n|glczBYk?Q0)C>FLpZQ_~l&89daPEC25O)q%_F z%Fj)j{~QaBd-Q1kU#FPA`t`lR*9+emGjM3jes~k;AH8S%Ysx0}Ox)K^(uW*8Q}+H8 zL$CO(vvD6h@#@M}X^#c0Gj03zli+RZ@}3Ryy0&V}M+b%B-A9ibIWIII|CjckrZoI< z_L%UIJ6E)fO!DaP$X7EXBaTk!;qUiP!}P2r)4v$KD>TsW+lkNbOkTci*O{2ytj06b ze%d(UTuMvh#&O4d?|Y`W^C3mSL3Uxz&)TJJS0R=fysgNm*Yy?=#xbLIeD`ipwpG`| z^1$^MKK{nz@a>mh%znb>o0T_A`QKj~zUnkvxM+82zn98h9s6xh)^_%pRhwQ}z2Wc! zYu{=3Qt-Ur?pw3izi3*x?9s`_qv+17V&HjVw-c+!8I_sWFcIq3QC7xpdqA$Cdk0b$+lPkI0RuQMnAyk*#`UEehd^nPLL zYlj|~`?rM;cB?w_@ci`b<;9EQ$8;X#8TwVDKK;uxAMd&J*yyi!s1tI>Ux?YRS(tdf z*MQ-Dudcd$erB%U`w!<{?)g@qCfeX*&om4h81}@9jHH!6?K{=J$8UWzXl2*rtz&*U)Bn`kpx*)}?Wj6c za(elPez&T=Jkz_~eVx;rCB5IuUlZ|0lOMh|n?IPnaN%&z^e?L_{nE!(Zfcn{$|nxEkCzh z_%L;U_Q;T3ng{kpKRlz2pHVmO%(scF&y9Ve=kbllJlE>Nt(6_;#469OFPi<{mR{|9 z-`?N8*9$j7KP+oBr?S!0m#;kAUinGuS6b&+?rmH2_?!hBAK8%cw(Y(V4Y$AdiLSKx z^A`)(v^kQNZ7b;Uz|rQ56Yt9$yTxZlr^V}6sn!R3EEsX-z#Fg5-qole{IhK*TLx=m z?*Dm8!gEuK+s&DjWPEJN*@lZtn{5r7Rq*d4Kb&ZLUx&q=CdPkJmAr7{!li=?!{5z} z?$pXlGrZZbHJf&4*^UPsUC`nM&*tY|ZxwKMq5t6d0grUP(K~2ElT*R7h8}-?;HHrD zXEFwkydlP} zyIp%@lwx^V;;`9?O;=A?-=pKA{=Y6PPZ%^j>8Ep-f&!0^ysY&dT(bJNIj_w;*)aRi zzZC0^#+|(I{K?VRGqT@16x+A|Q=>x8ZyWvXnipR_7jf#R`Jee*&xvU~_4*g4CH>#; zvE^LU{^uvOuKsk);g1z#+DEASd4!(2-0%t8D{-F6oCC}K^)Z24J&sO0(q9;T=E0D$ zp1+j5Z@k^&>-cK(#81AB)&~~GZyGWrC39p(PK57?S#Rs!`mwqG#?-1eyN_D8SC_MU z%!{KoH28Y@`t3hF++$0F23=?8^!YN}?CDb;TYc){wnuzAzBgpm(Zxsh?SJLS(H(=| zoAl=HfQ*-p54-g5$e*LqBSze+`nK?!Cnvv|JiOiEsDEx=VA$P$SQ!lO<)g$CP|F2$KaQz>`=+R{hCNG_T+0yKr zjEkS2Q$PDs&&wsnA6>`_d}iRNm!sn%J_+p{eWAyQ8;Q+3oSG4D-uZm>!V{Y8*cn~2 z8=P1deWKTy!{OEYRz6Tte6*_P6UlGco(^d8}GW6GqmB+){Yz#RvXLv*9 zFO34%jeqLc%VnD)X0_0?P9XnXq#u27?c;04&wb^g 
zS#R}h`Qxhxrd+$PhyLcWpZCYV=cE7jQsr~sq$w&k%xSQ+kN4~qBR>6U+(?`8pNpUS z@@Ke#H9dmB!}FHDjlr}H{Dm%VgR1BTaqqy%FOb}#^oAcqH=lXyk#b+N`N)=_XF7lR z*rBzbd(Qm2S77hK;rCzuK5$)xukumt_7@FXUwCO>v;5evVv1I`_U%{o{G>%Yi#K0> zDn2VDW6p{Gy7l89dCJ`9`{2T7hILKv9G=xx8MDyRsLRm#OAgJ=`DOaLzAK&{{&m=j zf2#IwJ(O~*blS43ixpFMhrgE>G^LWYT9CP@dc$vhThA$s>Jak4?fX^M6SKU#zP#ta znU0HBr+Muky=BXXr_nI6=?afXOs?v3l9A16tHou`9Og>E-Enl>9SMFqxsmQSx;tWy z8$;l*J2_;cbLPZ;cS`PFk%Q8a+9+`!$(@F~5;$mdMP$smHov11nZvvh6{TQ|H+ulB z??|3B8!#9?nLyu;!`oT|!(x=tVfauHz7U7ks|JSU<|Ie<4a28W@Yb5vh_Cq$46D#v z!v+ruY&byMFW+cM*Omw%V6yhp4wS#8YQ>A=#agQpUm-N%8hwIAYs0CfFiL09(ie1# zjZv}D(Y>Sa_N*yP$SaH)7-qD^gbB9EIMPkTQC?vzrNvSjj&u`oM4yK=*^omzkW+A7 zOtI(bDW~+5^Tf$1adJv8Ii;7J(pygH9VQgvg9Q`OBHD$~yVogVf_xMfH7I~ljHej$ zYcZBZF*X}!loqxZQ#^XZ;(Q>UmUp5@t}0@6E!%0&4gPgs_4WF_tGjXai@9DyT*;oS zk#ep^3f++e&`bkUZP2zKOinb8sr$Sgf#%C*CvF`!U z6qnkch)lH7oK_ptic`Mo5UY%6^+{-ixRL;i4;dC5^8aapr`4Pe<`4XTR0!CI0v!RO zNvqaqJ+0SDr;u5c9o) zT8cMdL*p!%Euz5sSXl|BT{0`#In0C>^}$OT3Kvw+#a z0^kK;C-4E#1rqKC(2ED_fYA^_L$s0F5(qQ~f`Fy~ElvvtLV#GHCs5Xj{=I?se;Ey+ z6E#6{pe_{w!~-h83X}qGLBtz@N(ku~kd3~Y3+x6y0xE%HK!O*#J5U4}O#o~_8SpT8 zPVykx_X7?Bt)WQG!0TY36VL_t+Kt~bA3p~#D}ld(pS$pAq7pa;%tNnx7T5&511v%h zTnt2_pGN@$fI+|)z$xG|@B{EX`u_{SQuP1jz+fo53iuHHd=Jne4CVlc>X5zf#?p9(;*gZP$ug4r{gl%k@4r zuLW^yL59+O^M^%jJd+;2K_~WjQ$C-uIHom5nxP7WzZE=)eg^`5y_(Eic$*VAkcF)H@1@qY6KFK8{acOQkvRHoCyMn zl%{`(A1#CJWY#wWnU=vd8gnOpTxww_F+Bt%F14sZLga@-bsZx1;lg#&kG@_YA?imj zUj<}JDy`&F_2j>fd+ErN-#-;^z0MT%YpMeGPgRV^xy#KhXy}zTda_DeFu$3e-b%{J zHES6j;kqd2qeYbYu@tJmq^9#IMcia!tvIm~Upq0bUnRb^h{c(pq_DLR6l)jnZzssL zE#BXayUxTUQPoK-sF9r**Snxbf1;^pg6zb&2A&D}lTBUU%ubN&;`(NPqN&qE?8LZM zP7nDLP0eg&Czja?`jy%0Pc~I(YbPkQ1wo5y7swU~Itaf%&FDHepY64#a zFUHa;LwXB+-M|)%7Ut6v&I$bmzVhQYTPLO+sboHpQH)-qqGj%Jiu>5|;C_r=d}+~c z5Th4?EdDA+FEyTTlfj?s(VGwS;!mU*}Kz==DBYL{AIAX?fUf*Jj3&C zu#co9o?o}`7x4rS*@)3JXFN><;wd{7(}3k-DmoD`vvT}WDv3ef#9Wk{rZ>_g#_5=% z0Ye(i11S#=A5U*DZx2te>hA1}YBJkFvkTgILD=VD%1e)y#f1%elt&Mk>4CErB{SB` 
zXC7AvIGN6yTzN{HPCwarz)z2|&3MobUt28Q-^a^?HePySEv2h4#WtWMx@T0=d5Uzo zdN_x>>#`ANXv;utsg|QqmW3}QxKkVMLmP-wavSQROb_Otk;_Snq|l`>^Gs1r{%}Y^l_c;b?~iVF%)~7G?EvyRWWV(sG_6NkZ_f#k(4K`gwSG zdwA8!X<7h43q1_HR9Xl`Tx6_2?ssqo3F0PKmd>Wr&j`+wLe!UdQ^;p7DzR)YZ#2}$ z6D{>%XR5=oib2}smL{Ucq#L5F1(uF;8`^*u6kuX@G2fQcZyb`{UDi9t<5dcDbY5|M zwm+ZtYazH*m_2x59!Luj7cP?)2FNH0d(z~Z%QUz@VkQ9kgsAeX0NCd?CC>wfrbL82mhy=4w5qcP`4IQ zjvz#ncvOXFpNwCEye*Rq29K^H@f1*9O*%+hFjKQguFD2kjebnPUpgXgdh`xjFk*v6 zjECMEa3pSR_|mt0Xg*mNTEJDv7sSyLI$9JaE; zc@!c|%!WOpc5FKKN&K&i^c4DlyR9M~ zUk^k&g-higOvj|=IgWomm4@$^;ve+kn|>PprT-fFsa_fM{KBZznQg`VI@JXqA(p;D zY_{OkeqRih8Or6P8}jjAH1OY=@TtCheA3U1fA%l}-}qlw_KOQrE&D_xa z3V!_I&(Al%bU-|AyZ=S{ufoJZnUr8>N!x)fqD+qbD*9B^&F_@Ks^WQ zIZ)4mdJfccpq>MDaiFgGKg~xyDi2mJQML)1{{rU!5x=ZULYlW*nhtYtn$Oex7&{)8 zrMY?uM{{G9w5PZOol`JUoHIA#^*~ac`}U|HwTWbeJI7_FCXGv>|45(B(Z!i!=Up5D8q@M0;TN3L-$eMNeGbTTPnt*4b7h)C(ma!%+tV|9dR|Y@ z=xMG_^HZ9W(=&OR3KXYL~RYYztKhWQ~?MDXujGJ z2nTLsbxvpOG3}-AMe*;vbjQ925CKF2N+1e|2Jo1jFWBjceH_pW=neD%XpT*5PWu7< zfdK%~AA~*D1F?8O1tb880L3@xdNTGyfD|AV7zzvn(g3>t8=%2)rgTne+0ro%|56(* z)}$|730RIq-v_c_NzuJ*A5KB^w%={}YARSNKOIX!&8P#lRSRB-@6GBd$6al$5lxfJC%aYKHuA?+T<@A&-sF4d zJO@}M`xq7^19}+8_G9sKC2DaPc0q&wJ(L~AqP#Egdm1SN@#nt9GKIbUWHW2q-jd2U zOsZLaDq5_E^#ADAP~tz?>bvDXl|}JY+A^VaX2c0;F)?WyE#|VE~Xg`snaqDi|PvhBm;6Js=`ggbG|EJ28>HL}UiFaf_$Q;u+Mtr1x zl2v} z6CE9??wNRBQr|>%Lht0nX)+^q)<4?(eEb7ubMeSxW7wbpC|>;BMwY(7>Uv65E43K@ zXelo8iUOD2ilTEDR@^o7At42ojCA+0C=Y|YybcN literal 0 HcmV?d00001 diff --git a/licensing/LICENSE b/licensing/LICENSE new file mode 100644 index 000000000..648ec8fc3 --- /dev/null +++ b/licensing/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2012 The Broad Institute + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to 
permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R new file mode 100644 index 000000000..88fc48e2a --- /dev/null +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R @@ -0,0 +1,236 @@ +library(gplots) +library(ggplot2) + +# ------------------------------------------------------- +# Utilities for displaying multiple plots per page +# ------------------------------------------------------- + +distributeGraphRows <- function(graphs, heights = c()) { + # Viewport layout 2 graphs top to bottom with given relative heights + # + # + if (length(heights) == 0) { + heights <- rep.int(1, length(graphs)) + } + heights <- heights[!is.na(graphs)] + graphs <- graphs[!is.na(graphs)] + numGraphs <- length(graphs) + Layout <- grid.layout(nrow = numGraphs, ncol = 1, heights=heights) + grid.newpage() + pushViewport(viewport(layout = Layout)) + subplot <- function(x) viewport(layout.pos.row = x, layout.pos.col = 1) + for (i in 1:numGraphs) { + print(graphs[[i]], vp = subplot(i)) + } +} + +distributeLogGraph <- function(graph, xName) { + continuousGraph <- graph + scale_x_continuous(xName) + logGraph <- graph + scale_x_log10(xName) + opts(title="") + 
distributeGraphRows(list(continuousGraph, logGraph)) +} + +distributePerSampleGraph <- function(perSampleGraph, distGraph, ratio=c(2,1)) { + distributeGraphRows(list(perSampleGraph, distGraph), ratio) +} + +removeExtraStrats <- function(variantEvalDataFrame, moreToRemove=c()) { + # Remove the standard extra stratification columns FunctionalClass, Novelty, and others in moreToRemove from the variantEvalDataFrame + # + # Only keeps the column marked with "all" for each removed column + # + for ( toRemove in c("FunctionalClass", "Novelty", moreToRemove) ) { + if (toRemove %in% colnames(variantEvalDataFrame)) { + variantEvalDataFrame <- variantEvalDataFrame[variantEvalDataFrame[[toRemove]] == "all",] + } + } + variantEvalDataFrame +} + +openPDF <- function(outputPDF) { + # Open the outputPDF file with standard dimensions, if outputPDF is not NA + if ( ! is.na(outputPDF) ) { + pdf(outputPDF, height=8.5, width=11) + } +} + +closePDF <- function(outputPDF) { + # close the outputPDF file if not NA, and try to compact the PDF if possible + if ( ! is.na(outputPDF) ) { + dev.off() + if (exists("compactPDF")) { + compactPDF(outputPDF) + } + } +} + +makeRatioDataFrame <- function(ACs, num, denom, widths = NULL) { + if ( is.null(widths) ) widths <- rep(1, length(ACs)) + + value = NULL + titv <- data.frame(AC=ACs, width = widths, num=num, denom = denom, ratio = num / denom) +} + +.reduceACs <- function(binWidthForAC, ACs) { + # computes data structures necessary to reduce the full range of ACs + # + # binWidthForAC returns the number of upcoming bins that should be merged into + # that AC bin. 
ACs is a vector of all AC values from 0 to 2N that should be + # merged together + # + # Returns a list containing the reduced ACs starts, their corresponding widths, + # and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc) + maxAC <- max(ACs) + newACs <- c() + widths <- c() + newACMap <- c() + ac <- 0 + while ( ac < maxAC ) { + newACs <- c(newACs, ac) + width <- binWidthForAC(ac) + widths <- c(widths, width) + newACMap <- c(newACMap, rep(ac, width)) + ac <- ac + width + } + list(ACs = newACs, widths=widths, newACMap = newACMap) +} + +# geometricACs <- function(k, ACs) { +# nBins <- round(k * log10(max(ACs))) +# +# binWidthForAC <- function(ac) { +# max(ceiling(ac / nBins), 1) +# } +# +# return(reduceACs(binWidthForAC, ACs)) +# } + +reduce.AC.on.LogLinear.intervals <- function(scaleFactor, ACs) { + # map the full range of AC values onto a log linear scale + # + # Reduce the full AC range onto one where the width of each new AC increases at a rate of + # 10^scaleFactor in size with growing AC values. 
This is primarily useful for accurately + # computing ratios or other quantities by AC that aren't well determined when the AC + # values are very large + # + # Returns a list containing the reduced ACs starts, their corresponding widths, + # and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc) + maxAC <- max(ACs) + afs <- ACs / maxAC + breaks <- 10^(seq(-4, -1, scaleFactor)) + widths <- c() + lastBreak <- 1 + for ( i in length(breaks):1 ) { + b <- breaks[i] + width <- sum(afs < lastBreak & afs >= b) + widths <- c(widths, width) + lastBreak <- b + } + widths <- rev(widths) + + binWidthForAC <- function(ac) { + af <- ac / maxAC + value = 1 + for ( i in length(breaks):1 ) + if ( af >= breaks[i] ) { + value = widths[i] + break + } + + return(value) + } + + return(.reduceACs(binWidthForAC, ACs)) +} + +.remapACs <- function(remapper, k, df) { + newACs <- remapper(k, df$AC) + + n = length(newACs$ACs) + num = rep(0, n) + denom = rep(0, n) + for ( i in 1:dim(df)[1] ) { + rowI = df$AC == i + row = df[rowI,] + newAC = newACs$newACMap[row$AC] + newRowI = newACs$ACs == newAC + num[newRowI] = num[newRowI] + df$num[rowI] + denom[newRowI] = denom[newRowI] + df$denom[rowI] + } + + newdf <- makeRatioDataFrame(newACs$ACs, num, denom, newACs$widths ) + newdf +} + +compute.ratio.on.LogLinear.AC.intervals <- function(ACs, num, denom, scaleFactor = 0.1) { + df = makeRatioDataFrame(ACs, num, denom, 1) + return(.remapACs(reduce.AC.on.LogLinear.intervals, scaleFactor, df)) +} + +plotVariantQC <- function(metrics, measures, requestedStrat = "Sample", + fixHistogramX=F, anotherStrat = NULL, nObsField = "n_indels", + onSamePage=F, facetVariableOnXPerSample = F, facetVariableOnXForDist = T, moreTitle="") { + metrics$strat = metrics[[requestedStrat]] + + otherFacet = "." + id.vars = c("strat", "nobs") + metrics$nobs <- metrics[[nObsField]] + + # keep track of the other strat and it's implied facet value + if (! 
is.null(anotherStrat)) { + id.vars = c(id.vars, anotherStrat) + otherFacet = anotherStrat + } + + molten <- melt(metrics, id.vars=id.vars, measure.vars=c(measures)) + perSampleGraph <- ggplot(data=molten, aes(x=strat, y=value, group=variable, color=variable, fill=variable)) + title <- opts(title=paste(paste(paste(measures, collapse=", "), "by", requestedStrat), moreTitle)) + + determineFacet <- function(onX) { + if ( onX ) { + paste(otherFacet, "~ variable") + } else { + paste("variable ~", otherFacet) + } + } + + sampleFacet = determineFacet(facetVariableOnXPerSample) + distFacet = determineFacet(facetVariableOnXForDist) + + if ( requestedStrat == "Sample" ) { + perSampleGraph <- perSampleGraph + geom_text(aes(label=strat), size=1.5) + geom_blank() # don't display a scale + perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "") + } else { + perSampleGraph <- perSampleGraph + geom_point(aes(size=log10(nobs))) #+ geom_smooth(aes(weight=log10(nobs))) + perSampleGraph <- perSampleGraph + scale_x_log10("AlleleCount") + } + perSampleGraph <- perSampleGraph + ylab("Variable value") + title + perSampleGraph <- perSampleGraph + facet_grid(sampleFacet, scales="free") + + nValues = length(unique(molten$value)) + if (nValues > 2) { + if ( requestedStrat == "Sample" ) { + distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable)) + } else { + distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable, weight=nobs)) + } + distGraph <- distGraph + geom_histogram(aes(y=..ndensity..)) + distGraph <- distGraph + geom_density(alpha=0.5, aes(y=..scaled..)) + distGraph <- distGraph + geom_rug(aes(y=NULL, color=variable, position="jitter")) + scale = "free" + if ( fixHistogramX ) scale = "fixed" + distGraph <- distGraph + facet_grid(distFacet, scales=scale) + distGraph <- distGraph + ylab("Relative frequency") + distGraph <- distGraph + xlab("Variable value (see facet for variable by color)") + 
distGraph <- distGraph + opts(axis.text.x=theme_text(angle=-45)) # , legend.position="none") + } else { + distGraph <- NA + } + + if ( onSamePage ) { + suppressMessages(distributePerSampleGraph(perSampleGraph, distGraph)) + } else { + suppressMessages(print(perSampleGraph)) + suppressMessages(print(distGraph + title)) + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 1cea14a9d..b821b98e6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -18,10 +18,7 @@ import java.util.Collection; import java.util.Iterator; import java.util.LinkedList; import java.util.Queue; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.FutureTask; +import java.util.concurrent.*; /** * A microscheduler that schedules shards according to a tree-like structure. @@ -44,11 +41,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar private final Queue reduceTasks = new LinkedList(); - /** - * An exception that's occurred in this traversal. If null, no exception has occurred. - */ - private RuntimeException error = null; - /** * Queue of incoming shards. */ @@ -99,11 +91,13 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar ReduceTree reduceTree = new ReduceTree(this); initializeWalker(walker); + // + // exception handling here is a bit complex. We used to catch and rethrow exceptions all over + // the place, but that just didn't work well. Now we have a specific execution exception (inner class) + // to use for multi-threading specific exceptions. 
All RuntimeExceptions that occur within the threads are rethrown + // up the stack as their underlying causes + // while (isShardTraversePending() || isTreeReducePending()) { - // Check for errors during execution. - if(hasTraversalErrorOccurred()) - throw getTraversalError(); - // Too many files sitting around taking up space? Merge them. if (isMergeLimitExceeded()) mergeExistingOutput(false); @@ -130,12 +124,8 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar result = reduceTree.getResult().get(); notifyTraversalDone(walker,result); } - catch (ReviewedStingException ex) { - throw ex; - } - catch (Exception ex) { - throw new ReviewedStingException("Unable to retrieve result", ex); - } + catch( InterruptedException ex ) { handleException(ex); } + catch( ExecutionException ex ) { handleException(ex); } // do final cleanup operations outputTracker.close(); @@ -338,32 +328,41 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar } /** - * Detects whether an execution error has occurred. - * @return True if an error has occurred. False otherwise. + * Handle an exception that occurred in a worker thread as needed by this scheduler. 
+ * + * The way to use this function in a worker is: + * + * try { doSomeWork(); + * catch ( InterruptedException ex ) { hms.handleException(ex); } + * catch ( ExecutionException ex ) { hms.handleException(ex); } + * + * @param ex the exception that occurred in the worker thread */ - private synchronized boolean hasTraversalErrorOccurred() { - return error != null; - } - - private synchronized RuntimeException getTraversalError() { - if(!hasTraversalErrorOccurred()) - throw new ReviewedStingException("User has attempted to retrieve a traversal error when none exists"); - return error; + protected final void handleException(InterruptedException ex) { + throw new HierarchicalMicroScheduler.ExecutionFailure("Hierarchical reduce interrupted", ex); } /** - * Allows other threads to notify of an error during traversal. + * Handle an exception that occurred in a worker thread as needed by this scheduler. + * + * The way to use this function in a worker is: + * + * try { doSomeWork(); + * catch ( InterruptedException ex ) { hms.handleException(ex); } + * catch ( ExecutionException ex ) { hms.handleException(ex); } + * + * @param ex the exception that occurred in the worker thread */ - protected synchronized void notifyOfTraversalError(Throwable error) { - // If the error is already a Runtime, pass it along as is. Otherwise, wrap it. - if (error instanceof RuntimeException) - this.error = (RuntimeException)error; + protected final void handleException(ExecutionException ex) { + if ( ex.getCause() instanceof RuntimeException ) + // if the cause was a runtime exception that's what we want to send up the stack + throw (RuntimeException )ex.getCause(); else - this.error = new ReviewedStingException("An error occurred during the traversal.", error); - + throw new HierarchicalMicroScheduler.ExecutionFailure("Hierarchical reduce failed", ex); } + /** A small wrapper class that provides the TreeReducer interface along with the FutureTask semantics. 
*/ private class TreeReduceTask extends FutureTask { private TreeReducer treeReducer = null; @@ -382,6 +381,17 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar } } + /** + * A specific exception class for HMS-specific failures such as + * Interrupted or ExecutionFailures that aren't clearly the fault + * of the underlying walker code + */ + public static class ExecutionFailure extends ReviewedStingException { + public ExecutionFailure(final String s, final Throwable throwable) { + super(s, throwable); + } + } + /** * Used by the ShardTraverser to report time consumed traversing a given shard. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index badd39860..9920213a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -27,16 +27,15 @@ import java.util.concurrent.Callable; * Carries the walker over a given shard, in a callable interface. */ public class ShardTraverser implements Callable { - private HierarchicalMicroScheduler microScheduler; - private Walker walker; - private Shard shard; - private TraversalEngine traversalEngine; - private ThreadLocalOutputTracker outputTracker; + final private HierarchicalMicroScheduler microScheduler; + final private Walker walker; + final private Shard shard; + final private TraversalEngine traversalEngine; + final private ThreadLocalOutputTracker outputTracker; private OutputMergeTask outputMergeTask; /** our log, which we want to capture anything from this class */ - protected static Logger logger = Logger.getLogger(ShardTraverser.class); - + final protected static Logger logger = Logger.getLogger(ShardTraverser.class); /** * Is this traversal complete? 
@@ -58,11 +57,10 @@ public class ShardTraverser implements Callable { public Object call() { try { traversalEngine.startTimersIfNecessary(); - long startTime = System.currentTimeMillis(); + final long startTime = System.currentTimeMillis(); Object accumulator = walker.reduceInit(); - LocusWalker lWalker = (LocusWalker)walker; - WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(), + final WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(), microScheduler.getReadIterator(shard), shard.getGenomeLocs(), microScheduler.engine.getSampleDB().getSampleNames()); // todo: microScheduler.engine is protected - is it okay to user it here? @@ -76,18 +74,12 @@ public class ShardTraverser implements Callable { windowMaker.close(); outputMergeTask = outputTracker.closeStorage(); - long endTime = System.currentTimeMillis(); + final long endTime = System.currentTimeMillis(); microScheduler.reportShardTraverseTime(endTime-startTime); return accumulator; - } - catch(Throwable t) { - // Notify that an exception has occurred and rethrow it. - microScheduler.notifyOfTraversalError(t); - throw new ReviewedStingException("An error has occurred during traversal",t); - } - finally { + } finally { synchronized(this) { complete = true; notifyAll(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java b/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java index 6acaadd50..fc8a89c64 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java @@ -25,20 +25,11 @@ import java.util.concurrent.Future; * interface to force the reduce. 
*/ public class TreeReducer implements Callable { - private HierarchicalMicroScheduler microScheduler; + final private HierarchicalMicroScheduler microScheduler; private TreeReducible walker; private Future lhs; private Future rhs; - /** - * Create a one-sided reduce. Result will be a simple pass-through of the result. - * @param microScheduler The parent hierarchical microscheduler for this reducer. - * @param lhs The one side of the reduce. - */ - public TreeReducer( HierarchicalMicroScheduler microScheduler, Future lhs ) { - this( microScheduler, lhs, null ); - } - /** * Create a full tree reduce. Combine this two results using an unspecified walker at some point in the future. * @param microScheduler The parent hierarchical microscheduler for this reducer. @@ -67,10 +58,7 @@ public class TreeReducer implements Callable { if( lhs == null ) throw new IllegalStateException(String.format("Insufficient data on which to reduce; lhs = %s, rhs = %s", lhs, rhs) ); - if( rhs == null ) - return lhs.isDone(); - - return lhs.isDone() && rhs.isDone(); + return lhs.isDone() && (rhs == null || rhs.isDone()); } /** @@ -80,24 +68,21 @@ public class TreeReducer implements Callable { public Object call() { Object result = null; - long startTime = System.currentTimeMillis(); + final long startTime = System.currentTimeMillis(); try { if( lhs == null ) result = lhs.get(); + // todo -- what the hell is this above line? Shouldn't it be the two below? 
+// if( lhs == null ) +// throw new IllegalStateException(String.format("Insufficient data on which to reduce; lhs = %s, rhs = %s", lhs, rhs) ); else result = walker.treeReduce( lhs.get(), rhs.get() ); } - catch( InterruptedException ex ) { - microScheduler.notifyOfTraversalError(ex); - throw new ReviewedStingException("Hierarchical reduce interrupted", ex); - } - catch( ExecutionException ex ) { - microScheduler.notifyOfTraversalError(ex); - throw new ReviewedStingException("Hierarchical reduce failed", ex); - } + catch( InterruptedException ex ) { microScheduler.handleException(ex); } + catch( ExecutionException ex ) { microScheduler.handleException(ex); } - long endTime = System.currentTimeMillis(); + final long endTime = System.currentTimeMillis(); // Constituent bits of this tree reduces are no longer required. Throw them away. this.lhs = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java index 82cb43634..94051cc7f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -12,7 +12,6 @@ * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND @@ -99,8 +98,13 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * Create a new stub given the requested file. + * + * @param engine engine. 
* @param genotypeFile file to (ultimately) create. * @param isCompressed should we compress the output stream? + * @param argumentSources sources. + * @param skipWritingHeader skip writing header. + * @param doNotWriteGenotypes do not write genotypes. */ public VCFWriterStub(GenomeAnalysisEngine engine, File genotypeFile, boolean isCompressed, Collection argumentSources, boolean skipWritingHeader, boolean doNotWriteGenotypes) { this.engine = engine; @@ -114,8 +118,13 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * Create a new stub given the requested file. + * + * @param engine engine. * @param genotypeStream stream to (ultimately) write. * @param isCompressed should we compress the output stream? + * @param argumentSources sources. + * @param skipWritingHeader skip writing header. + * @param doNotWriteGenotypes do not write genotypes. */ public VCFWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, boolean isCompressed, Collection argumentSources, boolean skipWritingHeader, boolean doNotWriteGenotypes) { this.engine = engine; @@ -154,7 +163,7 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * Gets the master sequence dictionary from the engine associated with this stub * @link GenomeAnalysisEngine.getMasterSequenceDictionary - * @return + * @return the master sequence dictionary from the engine associated with this stub */ public SAMSequenceDictionary getMasterSequenceDictionary() { return engine.getMasterSequenceDictionary(); @@ -188,22 +197,25 @@ public class VCFWriterStub implements Stub, VCFWriter { vcfHeader = header; // Check for the command-line argument header line. If not present, add it in. 
- if ( !skipWritingHeader ) { - VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine(); - boolean foundCommandLineHeaderLine = false; - for (VCFHeaderLine line: vcfHeader.getMetaData()) { - if ( line.getKey().equals(commandLineArgHeaderLine.getKey()) ) - foundCommandLineHeaderLine = true; + if (!skipWritingHeader && header.isWriteEngineHeaders()) { + + if (header.isWriteCommandLine()) { + VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine(); + boolean foundCommandLineHeaderLine = false; + for (VCFHeaderLine line: vcfHeader.getMetaData()) { + if ( line.getKey().equals(commandLineArgHeaderLine.getKey()) ) + foundCommandLineHeaderLine = true; + } + if ( !foundCommandLineHeaderLine ) + vcfHeader.addMetaDataLine(commandLineArgHeaderLine); } - if ( !foundCommandLineHeaderLine ) - vcfHeader.addMetaDataLine(commandLineArgHeaderLine); // also put in the reference contig header lines String assembly = getReferenceAssembly(engine.getArguments().referenceFile.getName()); for ( SAMSequenceRecord contig : engine.getReferenceDataSource().getReference().getSequenceDictionary().getSequences() ) vcfHeader.addMetaDataLine(getContigHeaderLine(contig, assembly)); - vcfHeader.addMetaDataLine(new VCFHeaderLine("reference", "file://" + engine.getArguments().referenceFile.getAbsolutePath())); + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + engine.getArguments().referenceFile.getAbsolutePath())); } outputTracker.getStorage(this).writeHeader(vcfHeader); @@ -225,7 +237,7 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * Gets a string representation of this object. - * @return + * @return a string representation of this object. 
*/ @Override public String toString() { @@ -247,20 +259,20 @@ public class VCFWriterStub implements Stub, VCFWriter { val = String.format("", contig.getSequenceName(), contig.getSequenceLength(), assembly); else val = String.format("", contig.getSequenceName(), contig.getSequenceLength()); - return new VCFHeaderLine("contig", val); + return new VCFHeaderLine(VCFHeader.CONTIG_KEY, val); } private String getReferenceAssembly(String refPath) { // This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot String assembly = null; - if ( refPath.indexOf("b37") != -1 || refPath.indexOf("v37") != -1 ) + if (refPath.contains("b37") || refPath.contains("v37")) assembly = "b37"; - else if ( refPath.indexOf("b36") != -1 ) + else if (refPath.contains("b36")) assembly = "b36"; - else if ( refPath.indexOf("hg18") != -1 ) + else if (refPath.contains("hg18")) assembly = "hg18"; - else if ( refPath.indexOf("hg19") != -1 ) + else if (refPath.contains("hg19")) assembly = "hg19"; return assembly; } -} \ No newline at end of file +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 0e13e4ad9..2c2ee51bb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -47,6 +47,14 @@ public class RefMetaDataTracker { // // ------------------------------------------------------------------------------------------ + /** + * Only for testing -- not accesssible in any other context + */ + public RefMetaDataTracker() { + ref = null; + map = Collections.emptyMap(); + } + public RefMetaDataTracker(final Collection allBindings, final ReferenceContext ref) { this.ref = ref; diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java 
b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index 58002bd14..6551bf376 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -250,53 +250,40 @@ public class GATKReportTable { } /** - * Returns the first primary key matching the dotted column values. - * Ex: dbsnp.eval.called.all.novel.all - * - * @param dottedColumnValues Period concatenated values. + * Returns the first primary key matching the column values. + * Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all" + * @param columnValues column values. * @return The first primary key matching the column values or throws an exception. */ - public Object getPrimaryKeyByData(String dottedColumnValues) { - Object key = findPrimaryKey(dottedColumnValues); + public Object getPrimaryKeyByData(Object... columnValues) { + Object key = findPrimaryKeyByData(columnValues); if (key == null) - throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + dottedColumnValues); + throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + Arrays.asList(columnValues)); return key; } - /** - * Returns true if there is at least on row with the dotted column values. - * Ex: dbsnp.eval.called.all.novel.all - * - * @param dottedColumnValues Period concatenated values. - * @return true if there is at least one row matching the columns. - */ - public boolean containsPrimaryKey(String dottedColumnValues) { - return findPrimaryKey(dottedColumnValues) != null; - } - - /** - * Returns the first primary key matching the dotted column values. - * Ex: dbsnp.eval.called.all.novel.all - * - * @param dottedColumnValues Period concatenated values. - * @return The first primary key matching the column values or null. 
- */ - private Object findPrimaryKey(String dottedColumnValues) { - return findPrimaryKey(dottedColumnValues.split("\\.")); - } - /** * Returns the first primary key matching the column values. - * Ex: new String[] { "dbsnp", "eval", "called", "all", "novel", "all" } + * Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all" * * @param columnValues column values. - * @return The first primary key matching the column values. + * @return The first primary key matching the column values or null if the key does not exist. */ - private Object findPrimaryKey(Object[] columnValues) { + public Object findPrimaryKeyByData(Object... columnValues) { + if (columnValues == null) + throw new NullPointerException("Column values is null"); + if (columnValues.length == 0) + throw new IllegalArgumentException("Column values is empty"); + int columnCount = columns.size(); for (Object primaryKey : primaryKeyColumn) { boolean matching = true; - for (int i = 0; matching && i < columnValues.length; i++) { - matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i + 1)); + // i --> index into columnValues parameter + // j --> index into columns collection + for (int i = 0, j = 0; matching && i < columnValues.length && j < columnCount; j++) { + if (!columns.getByIndex(j).isDisplayable()) + continue; + matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i)); + i++; } if (matching) return primaryKey; @@ -360,8 +347,8 @@ public class GATKReportTable { * output file), and the format string used to display the data. 
* * @param columnName the name of the column - * @param defaultValue the default value of a blank cell - * @param display if true - the column will be displayed; if false - the column will be hidden + * @param defaultValue if true - the column will be displayed; if false - the column will be hidden + * @param display display the column * @param format the format string used to display data */ public void addColumn(String columnName, Object defaultValue, boolean display, String format) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 22d23f216..76c1ce8c5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -47,6 +47,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension ); + final List activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); // add active regions to queue of regions to process workQueue.addAll( activeRegions ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java index bb007893c..d27148884 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java @@ -16,4 +16,5 @@ import java.lang.annotation.RetentionPolicy; public @interface ActiveRegionExtension { public int extension() default 0; + public int maxRegion() default 1500; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java 
index 8ff4b2f6f..f217268d2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -7,10 +7,7 @@ import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; -import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; -import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; -import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; +import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -33,8 +30,8 @@ import java.util.List; @By(DataSource.READS) @Requires({DataSource.READS, DataSource.REFERENCE_BASES}) @PartitionBy(PartitionType.READ) -@ActiveRegionExtension(extension=50) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) +@ActiveRegionExtension(extension=50,maxRegion=1500) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) public abstract class ActiveRegionWalker extends Walker { @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java index ab1e452d7..0777037bf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java @@ -127,7 +127,7 @@ public class FlagStatWalker extends ReadWalker { if (read.getDuplicateReadFlag()) { myStat.duplicates++; } - if (read.getReferenceIndex() >= 0) { + if (!read.getReadUnmappedFlag()) { myStat.mapped++; } if (read.getReadPairedFlag()) { @@ -139,21 +139,21 @@ public class FlagStatWalker extends ReadWalker { myStat.read1++; } if (read.getProperPairFlag()) { - myStat.properly_paired++; } - if (!read.getMateUnmappedFlag() && read.getReferenceIndex() >= 0) { + if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) { myStat.with_itself_and_mate_mapped++; - } - if (read.getMateUnmappedFlag()) { - myStat.singletons++; - } - } - if (read.getReferenceIndex() >= 0 && read.getMateReferenceIndex() >= 0 && ! read.getReferenceIndex().equals(read.getMateReferenceIndex())) { - myStat.with_mate_mapped_to_a_different_chr++; - if (read.getMappingQuality() >= 5) { - myStat.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5++; + if (!read.getReferenceIndex().equals(read.getMateReferenceIndex())) { + myStat.with_mate_mapped_to_a_different_chr++; + + if (read.getMappingQuality() >= 5) { + myStat.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5++; + } + } + } + if (!read.getReadUnmappedFlag() && read.getMateUnmappedFlag()) { + myStat.singletons++; } } return 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 97a4ac468..6eea12e2b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -5,12 +5,10 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import 
org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; /** @@ -31,8 +29,31 @@ public class BaseQualityRankSumTest extends RankSumTest { altQuals.add((double)p.getQual()); } } - } + protected void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals) { + // TODO -- implement me; how do we pull out the correct offset from the read? + return; + +/* + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + + if ( isUsableBase(p) ) { + if ( matchesRef ) + refQuals.add((double)p.getQual()); + else + altQuals.add((double)p.getQual()); + } + } + } +*/ + } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? 
HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 0acd3e841..b3a8dbebd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -35,6 +36,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -49,7 +52,7 @@ import java.util.Map; * allele Frequency, for each ALT allele, in the same order as listed; total number * of alleles in called genotypes. 
*/ -public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation { +public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY }; private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"), @@ -63,6 +66,13 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true); } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( ! vc.hasGenotypes() ) + return null; + + return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true); + } + public List getKeyNames() { return Arrays.asList(keyNames); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index b744fec46..f94d48893 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -3,12 +3,15 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -33,7 +36,7 @@ import java.util.Map; * Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with * -dcov D is N * D */ -public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation { +public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -47,6 +50,22 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno return map; } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + int depth = 0; + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final List alleleBin : alleleBins.values() ) { + depth += alleleBin.size(); + } + } + + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%d", depth)); + return map; + } + public List getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 817d6b1ff..0d3bd11a7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -28,6 +28,7 @@ import cern.jet.math.Arithmetic; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -37,6 +38,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -49,7 +51,7 @@ import java.util.*; * indicative of false positive calls. Note that the fisher strand test may not be * calculated for certain complex indel cases or for multi-allelic sites. 
*/ -public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation { +public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; @@ -78,6 +80,22 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return map; } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( !vc.isVariant() ) + return null; + + int[][] table = getContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); + + Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); + if ( pvalue == null ) + return null; + + Map map = new HashMap(); + map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); + return map; + + } + public List getKeyNames() { return Arrays.asList(FS); } @@ -193,6 +211,38 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return sum; } + /** + Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: + * fw rc + * allele1 # # + * allele2 # # + * @return a 2x2 contingency table + */ + private static int[][] getContingencyTable(Map>> stratifiedContexts, Allele ref, Allele alt) { + int[][] table = new int[2][2]; + + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alt.equals(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + boolean isFW = !read.getReadNegativeStrandFlag(); /* forward strand == negative-strand flag NOT set; column 0 is "fw" per the table layout above, matching the pileup-based overload */ + + int row = matchesRef ? 0 : 1; + int column = isFW ? 0 : 1; + + table[row][column]++; + } + } + } + + return table; + } + /** Allocate and fill a 2x2 strand contingency table.
In the end, it'll look something like this: * fw rc @@ -214,8 +264,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat Allele base = Allele.create(p.getBase(), false); boolean isFW = !p.getRead().getReadNegativeStrandFlag(); - boolean matchesRef = ref.equals(base, true); - boolean matchesAlt = alt.equals(base, true); + final boolean matchesRef = ref.equals(base, true); + final boolean matchesAlt = alt.equals(base, true); if ( matchesRef || matchesAlt ) { int row = matchesRef ? 0 : 1; int column = isFW ? 0 : 1; @@ -227,6 +277,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return table; } + /** Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: * fw rc diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index 6366890d5..57561a277 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -3,12 +3,15 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import 
org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -27,12 +30,19 @@ import java.util.Map; * more information. Note that the Inbreeding Coefficient will not be calculated for files * with fewer than a minimum (generally 10) number of samples. */ -public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation { +public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final int MIN_SAMPLES = 10; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + return calculateIC(vc); + } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + return calculateIC(vc); + } + + private Map calculateIC(final VariantContext vc) { final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index aa4f26ef3..520b0f232 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -6,12 +6,10 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; 
import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; /** @@ -35,6 +33,23 @@ public class MappingQualityRankSumTest extends RankSumTest { } } } + + protected void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals) { + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + if ( matchesRef ) + refQuals.add((double)read.getMappingQuality()); + else + altQuals.add((double)read.getMappingQuality()); + } + } + } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? 
HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index bf60dec6b..24a107235 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -3,11 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -23,7 +26,7 @@ import java.util.Map; * Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing * reads associated with the samples with polymorphic genotypes. 
*/ -public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation { +public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -62,4 +65,40 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + final GenotypesContext genotypes = vc.getGenotypes(); + if ( genotypes == null || genotypes.size() == 0 ) + return null; + + int depth = 0; + + for ( final Genotype genotype : genotypes ) { + + // we care only about variant calls with likelihoods + if ( !genotype.isHet() && !genotype.isHomVar() ) + continue; + + final Map> alleleBins = stratifiedContexts.get(genotype.getSampleName()); + if ( alleleBins == null ) + continue; + + for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + if ( !alleleBin.getKey().equals(Allele.NO_CALL) ) + depth += alleleBin.getValue().size(); + } + } + + if ( depth == 0 ) + return null; + + double QD = -10.0 * vc.getLog10PError() / (double)depth; + + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.2f", QD)); + return map; + } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index 50ade5334..97c15e747 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -13,6 +14,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -24,7 +27,7 @@ import java.util.Map; /** * Root Mean Square of the mapping quality of the reads across all samples. 
*/ -public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation { +public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -34,7 +37,7 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn for ( AlignmentContext context : stratifiedContexts.values() ) totalSize += context.size(); - int[] qualities = new int[totalSize]; + final int[] qualities = new int[totalSize]; int index = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { @@ -54,6 +57,35 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn return map; } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + int depth = 0; + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + depth += alleleBin.getValue().size(); + } + } + + final int[] qualities = new int[depth]; + int index = 0; + + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final List reads : alleleBins.values() ) { + for ( final GATKSAMRecord read : reads ) { + if ( read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) + qualities[index++] = read.getMappingQuality(); + } + } + } + + double rms = MathUtils.rms(qualities); + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.2f", rms)); + return map; + } + public List getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index ff5f8f144..80d248ac2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -12,6 +13,7 @@ import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; @@ -26,7 +28,7 @@ import java.util.Map; /** * Abstract root for all RankSum based annotations */ -public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation { +public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { static final double INDEL_LIKELIHOOD_THRESH = 0.1; static final boolean DEBUG = false; @@ -38,7 +40,6 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements 
Standar if (genotypes == null || genotypes.size() == 0) return null; - final ArrayList refQuals = new ArrayList(); final ArrayList altQuals = new ArrayList(); @@ -104,12 +105,52 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar if (!Double.isNaN(testResults.first)) map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); return map; - } - protected abstract void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals); + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if (stratifiedContexts.size() == 0) + return null; - protected abstract void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals); + final GenotypesContext genotypes = vc.getGenotypes(); + if (genotypes == null || genotypes.size() == 0) + return null; + + final ArrayList refQuals = new ArrayList(); + final ArrayList altQuals = new ArrayList(); + + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + final Map> context = stratifiedContexts.get(genotype.getSampleName()); + if ( context == null ) + continue; + + fillQualsFromPileup(vc.getReference(), vc.getAlternateAlleles(), context, refQuals, altQuals); + } + + if ( refQuals.size() == 0 || altQuals.size() == 0 ) + return null; + + final MannWhitneyU mannWhitneyU = new MannWhitneyU(); + for (final Double qual : altQuals) { + mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); + } + for (final Double qual : refQuals) { + mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); + } + + // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) + final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1); + + final Map map = new HashMap(); + if (!Double.isNaN(testResults.first)) + map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); + return map; + } + + protected abstract void fillQualsFromPileup(final Allele ref, final List alts, final 
Map> stratifiedContext, final List refQuals, List altQuals); + + protected abstract void fillQualsFromPileup(final byte ref, final List alts, final ReadBackedPileup pileup, final List refQuals, final List altQuals); + + protected abstract void fillIndelQualsFromPileup(final ReadBackedPileup pileup, final List refQuals, final List altQuals); protected static boolean isUsableBase(final PileupElement p) { return !(p.isInsertionAtBeginningOfRead() || diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index a998cd08b..e013f0e08 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -11,12 +11,10 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; /** * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error). @@ -49,6 +47,27 @@ public class ReadPosRankSumTest extends RankSumTest { } } + protected void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals) { + // TODO -- implement me; how do we pull out the correct offset from the read? 
+ return; + +/* + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + if ( matchesRef ) + refQuals.add((double)read.getMappingQuality()); + else + altQuals.add((double)read.getMappingQuality()); + } + } +*/ + } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 90d0ad740..413c32a24 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -33,10 +33,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -94,6 +92,13 @@ public class VariantAnnotatorEngine { 
initializeDBs(); } + // experimental constructor for active region traversal + public VariantAnnotatorEngine(GenomeAnalysisEngine toolkit) { + this.walker = null; + this.toolkit = toolkit; + requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(Arrays.asList("ActiveRegionBasedAnnotation"), Collections.emptyList()); + } + // select specific expressions to use public void initializeExpressions(List expressionsToUse) { // set up the expressions @@ -169,7 +174,7 @@ public class VariantAnnotatorEngine { this.requireStrictAlleleMatch = requireStrictAlleleMatch; } - public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public VariantContext annotateContext(final RefMetaDataTracker tracker, final ReferenceContext ref, final Map stratifiedContexts, VariantContext vc) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences @@ -192,6 +197,20 @@ public class VariantAnnotatorEngine { return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make(); } + public VariantContext annotateContext(final Map>> stratifiedContexts, VariantContext vc) { + Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); + + // go through all the requested info annotationTypes + for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { + Map annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(stratifiedContexts, vc); + if ( annotationsFromCurrentType != null ) + infoAnnotations.putAll(annotationsFromCurrentType); + } + + // generate a new annotated VC + return new VariantContextBuilder(vc).attributes(infoAnnotations).make(); + } + private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java new file mode 100755 index 000000000..de61c7741 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java @@ -0,0 +1,18 @@ +package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; + +import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.List; +import java.util.Map; + +// TODO -- make this an abstract class when we move away from InfoFieldAnnotation +public interface ActiveRegionBasedAnnotation extends AnnotationType { + // return annotations for the given contexts split by sample and then allele + public abstract Map annotate(final Map>> stratifiedContexts, final VariantContext vc); + + // return the descriptions used for the VCF INFO meta field + public abstract List getDescriptions(); +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index b6a40f167..d73b22664 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +import net.sf.picard.util.PeekableIterator; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -32,8 +33,6 @@ import 
org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocComparator; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -79,10 +78,7 @@ public class DiagnoseTargets extends LocusWalker implements Annotato private IntervalBinding intervalTrack = null; @Output(doc = "File to which variants should be written", required = true) - protected VCFWriter vcfWriter = null; - - @Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false) - private int expandInterval = 50; + private VCFWriter vcfWriter = null; @Argument(fullName = "minimum_base_quality", shortName = "mbq", doc = "", required = false) private int minimumBaseQuality = 20; @@ -96,13 +92,11 @@ public class DiagnoseTargets extends LocusWalker implements Annotato @Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false) private int maximumCoverage = 700; - private TreeSet intervalList = null; // The list of intervals of interest (plus expanded intervals if user wants them) private HashMap intervalMap = null; // interval => statistics - private Iterator intervalListIterator; // An iterator to go over all the intervals provided as we traverse the genome - private GenomeLoc currentInterval = null; // The "current" interval loaded - private IntervalStatistics currentIntervalStatistics = null; // The "current" interval being filled with statistics - private Set samples = null; // All the samples being processed - private GenomeLocParser parser; // just an object to allow us to create genome locs (for the expanded intervals) + private PeekableIterator 
intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome + private Set samples = null; // all the samples being processed + + private final Allele SYMBOLIC_ALLELE = Allele.create("
", false); // avoid creating the symbolic allele multiple times @Override public void initialize() { @@ -111,72 +105,22 @@ public class DiagnoseTargets extends LocusWalker implements Annotato if (intervalTrack == null) throw new UserException("This tool currently only works if you provide an interval track"); - parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary()); // Important to initialize the parser before creating the intervals below - - List originalList = intervalTrack.getIntervals(getToolkit()); // The original list of targets provided by the user that will be expanded or not depending on the options provided - intervalList = new TreeSet(new GenomeLocComparator()); intervalMap = new HashMap(); - for (GenomeLoc interval : originalList) - intervalList.add(interval); - //addAndExpandIntervalToMap(interval); + intervalListIterator = new PeekableIterator(intervalTrack.getIntervals(getToolkit()).listIterator()); - intervalListIterator = intervalList.iterator(); - - // get all of the unique sample names - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - - // initialize the header - Set headerInfo = getHeaderInfo(); - - vcfWriter.writeHeader(new VCFHeader(headerInfo, samples)); - } - - /** - * Gets the header lines for the VCF writer - * - * @return A set of VCF header lines - */ - private Set getHeaderInfo() { - Set headerLines = new HashSet(); - - // INFO fields for overall data - headerLines.add(new VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, "Stop position of the interval")); - headerLines.add(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools")); - headerLines.add(new VCFInfoHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. 
Sum of the depth in a lci divided by interval size.")); - headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); - - // FORMAT fields for each genotype - headerLines.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools")); - headerLines.add(new VCFFormatHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); - - // FILTER fields - - for (CallableStatus stat : CallableStatus.values()) { - headerLines.add(new VCFHeaderLine(stat.name(), stat.description)); - } - - return headerLines; + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header + vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // initialize the VCF header } @Override public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { GenomeLoc refLocus = ref.getLocus(); - while (currentInterval == null || currentInterval.isBefore(refLocus)) { // do this for first time and while currentInterval is behind current locus - if (!intervalListIterator.hasNext()) - return 0L; - if (currentInterval != null) - processIntervalStats(currentInterval, Allele.create(ref.getBase(), true)); + removePastIntervals(refLocus, ref.getBase()); // process and remove any intervals in the map that are don't overlap the current locus anymore + addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus - currentInterval = intervalListIterator.next(); - addAndExpandIntervalToMap(currentInterval); - currentIntervalStatistics = intervalMap.get(currentInterval); - } - - if (currentInterval.isPast(refLocus)) // skip if we are behind the current interval - return 0L; - - currentIntervalStatistics.addLocus(context); // Add current locus to stats + for (IntervalStatistics 
intervalStatistics : intervalMap.values()) + intervalStatistics.addLocus(context); // Add current locus to stats return 1L; } @@ -198,10 +142,15 @@ public class DiagnoseTargets extends LocusWalker implements Annotato return sum + value; } + /** + * Process all remaining intervals + * + * @param result number of loci processed by the walker + */ @Override public void onTraversalDone(Long result) { - for (GenomeLoc interval : intervalMap.keySet()) - processIntervalStats(interval, Allele.create("
", true)); + for (GenomeLoc interval : intervalMap.keySet()) + processIntervalStats(intervalMap.get(interval), Allele.create("A")); } @Override @@ -219,82 +168,111 @@ public class DiagnoseTargets extends LocusWalker implements Annotato @Override public boolean alwaysAppendDbsnpId() {return false;} - private GenomeLoc createIntervalBefore(GenomeLoc interval) { - int start = Math.max(interval.getStart() - expandInterval, 0); - int stop = Math.max(interval.getStart() - 1, 0); - return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); - } + /** + * Removes all intervals that are behind the current reference locus from the intervalMap + * + * @param refLocus the current reference locus + * @param refBase the reference allele + */ + private void removePastIntervals(GenomeLoc refLocus, byte refBase) { + List toRemove = new LinkedList(); + for (GenomeLoc interval : intervalMap.keySet()) + if (interval.isBefore(refLocus)) { + processIntervalStats(intervalMap.get(interval), Allele.create(refBase, true)); + toRemove.add(interval); + } - private GenomeLoc createIntervalAfter(GenomeLoc interval) { - int contigLimit = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(interval.getContigIndex()).getSequenceLength(); - int start = Math.min(interval.getStop() + 1, contigLimit); - int stop = Math.min(interval.getStop() + expandInterval, contigLimit); - return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); + for (GenomeLoc interval : toRemove) + intervalMap.remove(interval); + + GenomeLoc interval = intervalListIterator.peek(); // clean up all intervals that we might have skipped because there was no data + while(interval != null && interval.isBefore(refLocus)) { + interval = intervalListIterator.next(); + processIntervalStats(createIntervalStatistic(interval), Allele.create(refBase, true)); + interval = intervalListIterator.peek(); + } } /** - * Takes an interval and commits it to memory. 
- * It will expand it if so told by the -exp command line argument + * Adds all intervals that overlap the current reference locus to the intervalMap * - * @param interval The new interval to process + * @param refLocus the current reference locus */ - private void addAndExpandIntervalToMap(GenomeLoc interval) { - if (expandInterval > 0) { - GenomeLoc before = createIntervalBefore(interval); - GenomeLoc after = createIntervalAfter(interval); - intervalList.add(before); - intervalList.add(after); - intervalMap.put(before, new IntervalStatistics(samples, before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); - intervalMap.put(after, new IntervalStatistics(samples, after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + private void addNewOverlappingIntervals(GenomeLoc refLocus) { + GenomeLoc interval = intervalListIterator.peek(); + while (interval != null && !interval.isPast(refLocus)) { + System.out.println("LOCUS : " + refLocus + " -- " + interval); + intervalMap.put(interval, createIntervalStatistic(interval)); + intervalListIterator.next(); // discard the interval (we've already added it to the map) + interval = intervalListIterator.peek(); } - if (!intervalList.contains(interval)) - intervalList.add(interval); - intervalMap.put(interval, new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); } /** * Takes the interval, finds it in the stash, prints it to the VCF, and removes it * - * @param interval The interval in memory that you want to write out and clear - * @param allele the allele + * @param stats The statistics of the interval + * @param refAllele the reference allele */ - private void processIntervalStats(GenomeLoc interval, Allele allele) { - IntervalStatistics stats = intervalMap.get(interval); - + private void processIntervalStats(IntervalStatistics stats, Allele refAllele) { + GenomeLoc interval = stats.getInterval(); + 
List alleles = new ArrayList(); Map attributes = new HashMap(); ArrayList genotypes = new ArrayList(); - alleles.add(allele); - VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); + alleles.add(refAllele); + alleles.add(SYMBOLIC_ALLELE); + VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles); vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF vcb.filters(statusesToStrings(stats.callableStatuses())); attributes.put(VCFConstants.END_KEY, interval.getStop()); - attributes.put(VCFConstants.DEPTH_KEY, stats.totalCoverage()); - attributes.put("AV", stats.averageCoverage()); + attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage()); vcb = vcb.attributes(attributes); for (String sample : samples) { Map infos = new HashMap(); - infos.put("DP", stats.getSample(sample).totalCoverage()); - infos.put("AV", stats.getSample(sample).averageCoverage()); + infos.put(VCFConstants.DEPTH_KEY, stats.getSample(sample).averageCoverage()); Set filters = new HashSet(); filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses())); - genotypes.add(new Genotype(sample, alleles, VariantContext.NO_LOG10_PERROR, filters, infos, false)); + genotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, filters, infos, false)); } vcb = vcb.genotypes(genotypes); vcfWriter.add(vcb.make()); - intervalMap.remove(interval); } + /** + * Gets the header lines for the VCF writer + * + * @return A set of VCF header lines + */ + private static Set getHeaderInfo() { + Set headerLines = new HashSet(); + + // INFO fields for overall data + headerLines.add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, 
VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); + + // FORMAT fields for each genotype + headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + + // FILTER fields + for (CallableStatus stat : CallableStatus.values()) + headerLines.add(new VCFHeaderLine(stat.name(), stat.description)); + + return headerLines; + } + + private static Set statusesToStrings(Set statuses) { Set output = new HashSet(statuses.size()); @@ -303,4 +281,8 @@ public class DiagnoseTargets extends LocusWalker implements Annotato return output; } + + private IntervalStatistics createIntervalStatistic(GenomeLoc interval) { + return new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java index 75f56808f..f3246407b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.util.HashMap; @@ -52,18 +53,28 @@ public class IntervalStatistics { return samples.get(sample); } + public GenomeLoc getInterval() { + return interval; + 
} + public void addLocus(AlignmentContext context) { ReadBackedPileup pileup = context.getBasePileup(); - for (String sample : samples.keySet()) - getSample(sample).addLocus(context.getLocation(), pileup.getPileupForSample(sample)); + Map samplePileups = pileup.getPileupsForSamples(samples.keySet()); + + for (Map.Entry entry : samplePileups.entrySet()) { + String sample = entry.getKey(); + ReadBackedPileup samplePileup = entry.getValue(); + SampleStatistics sampleStatistics = samples.get(sample); + + if (sampleStatistics == null) + throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample)); + + sampleStatistics.addLocus(context.getLocation(), samplePileup); + } + } - public long totalCoverage() { - if (preComputedTotalCoverage < 0) - calculateTotalCoverage(); - return preComputedTotalCoverage; - } public double averageCoverage() { if (preComputedTotalCoverage < 0) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java index c25517927..b9422b6e5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -36,10 +36,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import 
java.util.*; @@ -273,15 +270,14 @@ public class ConsensusAlleleCounter { builder.alleles(Arrays.asList(refAllele, altAllele)); builder.referenceBaseForIndel(ref.getBase()); builder.noGenotypes(); - if (doMultiAllelicCalls) + if (doMultiAllelicCalls) { + vcs.add(builder.make()); + if (vcs.size() >= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) + break; + } else if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + vcs.clear(); vcs.add(builder.make()); - else { - if (curCnt > maxAlleleCnt) { - maxAlleleCnt = curCnt; - vcs.clear(); - vcs.add(builder.make()); - } - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 93f5c0a43..aa4bde0ab 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -82,15 +82,22 @@ public class UnifiedArgumentCollection { public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; /** - * This argument is not enabled by default because it increases the runtime by an appreciable amount. + * Note that calculating the SLOD increases the runtime by an appreciable amount. */ @Argument(fullName = "noSLOD", shortName = "nosl", doc = "If provided, we will not calculate the SLOD", required = false) public boolean NO_SLOD = false; + /** + * Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles being sent on for genotyping. + * Using this argument instructs the genotyper to annotate (in the INFO field) the number of alternate alleles that were originally discovered at the site. 
+ */ + @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) + public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; + /** * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding */ - @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when in GENOTYPE_MODE = GENOTYPE_GIVEN_ALLELES", required=false) + @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false) public RodBinding alleles; /** @@ -105,8 +112,11 @@ public class UnifiedArgumentCollection { /** * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), - * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive. + * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it + * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend + * that you not play around with this parameter. 
*/ + @Advanced @Argument(fullName = "max_alternate_alleles", shortName = "maxAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) public int MAX_ALTERNATE_ALLELES = 3; @@ -171,6 +181,7 @@ public class UnifiedArgumentCollection { uac.GenotypingMode = GenotypingMode; uac.OutputMode = OutputMode; uac.NO_SLOD = NO_SLOD; + uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 8df501e1b..9036e3a62 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -39,6 +39,8 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -127,8 +129,19 @@ public class UnifiedGenotyper extends LocusWalker, Unif @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } + + /** + * If a call overlaps with a record from the provided comp track, the INFO field will be annotated + * as such in the output with the track name (e.g. 
-comp:FOO will have 'FOO' in the INFO field). + * Records that are filtered in the comp track will be ignored. + * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). + */ + @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + public List> comps = Collections.emptyList(); + public List> getCompRodBindings() { return comps; } + + // The following are not used by the Unified Genotyper public RodBinding getSnpEffRodBinding() { return null; } - public List> getCompRodBindings() { return Collections.emptyList(); } public List> getResourceRodBindings() { return Collections.emptyList(); } public boolean alwaysAppendDbsnpId() { return false; } @@ -203,6 +216,10 @@ public class UnifiedGenotyper extends LocusWalker, Unif * **/ public void initialize() { + // check for a bad max alleles value + if ( UAC.MAX_ALTERNATE_ALLELES > GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) + throw new UserException.BadArgumentValue("max_alternate_alleles", "the maximum possible value is " + GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); + // warn the user for misusing EMIT_ALL_SITES if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY && @@ -238,6 +255,8 @@ public class UnifiedGenotyper extends LocusWalker, Unif // annotation (INFO) fields from UnifiedGenotyper if ( !UAC.NO_SLOD ) headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); + if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) + headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotyperEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site")); headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); // also, check to see 
whether comp rods were included diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index f26dfe22e..94d340926 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -51,6 +51,8 @@ import java.util.*; public class UnifiedGenotyperEngine { public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; + public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3; public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; @@ -365,6 +367,9 @@ public class UnifiedGenotyperEngine { if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); + if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) + attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size()); + if ( !UAC.NO_SLOD && !limitedContext && !bestGuessIsRef ) { //final boolean DEBUG_SLOD = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 171c42040..eb8b9d950 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -157,7 +157,7 @@ public class PairHMMIndelErrorModel { } - private void updateCell(final int indI, final int indJ, final int X_METRIC_LENGTH, final int Y_METRIC_LENGTH, byte[] readBases, byte[] readQuals, byte[] haplotypeBases, + private static void updateCell(final int indI, final int indJ, final int X_METRIC_LENGTH, final int Y_METRIC_LENGTH, byte[] readBases, byte[] 
readQuals, byte[] haplotypeBases, byte[] currentGOP, byte[] currentGCP, double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { if (indI > 0 && indJ > 0) { final int im1 = indI -1; @@ -183,9 +183,27 @@ public class PairHMMIndelErrorModel { } } - private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, + public static double computeReadLikehoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, + byte[] currentGOP, byte[] currentGCP, boolean bandedLikelihoods) { + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions + final int X_METRIC_LENGTH = readBases.length + 1; + final int Y_METRIC_LENGTH = haplotypeBases.length + 1; + + // initial arrays to hold the probabilities of being in the match, insertion and deletion cases + final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + + return computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentGOP, + currentGCP, 0, matchMetricArray, XMetricArray, YMetricArray, bandedLikelihoods); + + } + private static double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, byte[] currentGOP, byte[] currentGCP, int indToStart, - double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { + double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray, + boolean bandedLikelihoods) { final int X_METRIC_LENGTH = readBases.length+1; final int Y_METRIC_LENGTH = haplotypeBases.length+1; @@ -391,6 +409,9 @@ public class PairHMMIndelErrorModel { } } else { + if (DEBUG) { + 
System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); + } // System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName()); GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); @@ -588,7 +609,7 @@ public class PairHMMIndelErrorModel { } - pairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); /* if (previousHaplotypeSeen == null) @@ -602,17 +623,14 @@ public class PairHMMIndelErrorModel { contextLogGapOpenProbabilities, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities, startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); - /* double r2 = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, contextLogGapOpenProbabilities, - contextLogGapContinuationProbabilities, 0, matchMetricArray, XMetricArray, YMetricArray); + double l2 = computeReadLikehoodGivenHaplotype(haplotypeBases, readBases, readQuals, contextLogGapOpenProbabilities, + contextLogGapContinuationProbabilities, bandedLikelihoods); - if (readLikelihood > 0) { - int k=0; - } - */ if (DEBUG) { + if (DEBUG) { System.out.println("H:"+new String(haplotypeBases)); System.out.println("R:"+new String(readBases)); System.out.format("L:%4.2f\n",readLikelihood); - // System.out.format("Lorig:%4.2f\n",r2); + // System.out.format("Lorig:%4.2f\n",r2); System.out.format("StPos:%d\n", startIndexInHaplotype); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java index d4bbacdf1..8887e3c4f 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java @@ -68,7 +68,7 @@ public class VariantEvalReportWriter { */ public final void writeReport(final PrintStream out) { for ( int key = 0; key < stratManager.size(); key++ ) { - final String stratStateString = stratManager.getStratsAndStatesForKeyString(key); + final String stratStateString = stratManager.getStratsAndStatesStringForKey(key); final List> stratsAndStates = stratManager.getStratsAndStatesForKey(key); final EvaluationContext nec = stratManager.get(key); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 6c7922ea5..a73bc2c70 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -17,6 +17,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.DynamicStratification; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.IntervalStratification; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.StratificationManager; @@ -221,6 +222,7 @@ public class VariantEvalWalker extends RodWalker implements Tr // The set of all possible evaluation contexts StratificationManager stratManager; + //Set dynamicStratifications = Collections.emptySet(); /** * Initialize the 
stratifications, evaluations, evaluation contexts, and reporting object @@ -360,6 +362,14 @@ public class VariantEvalWalker extends RodWalker implements Tr if (tracker != null) { String aastr = (ancestralAlignments == null) ? null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases()); +// // update the dynamic stratifications +// for (final VariantContext vc : tracker.getValues(evals, ref.getLocus())) { +// // don't worry -- DynamicStratification only work with one eval object +// for ( final DynamicStratification ds : dynamicStratifications ) { +// ds.update(vc); +// } +// } + // --------- track --------- sample - VariantContexts - HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false); @@ -456,13 +466,13 @@ public class VariantEvalWalker extends RodWalker implements Tr * @param sampleName * @return */ - private Collection getEvaluationContexts(final RefMetaDataTracker tracker, - final ReferenceContext ref, - final VariantContext eval, - final String evalName, - final VariantContext comp, - final String compName, - final String sampleName ) { + protected Collection getEvaluationContexts(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final VariantContext eval, + final String evalName, + final VariantContext comp, + final String compName, + final String sampleName ) { final List> states = new LinkedList>(); for ( final VariantStratifier vs : stratManager.getStratifiers() ) { states.add(vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java index c22f82969..dda7e8611 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java @@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -41,51 +40,81 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; public class IndelSummary extends VariantEvaluator implements StandardEval { final protected static Logger logger = Logger.getLogger(IndelSummary.class); + // + // counts of snps and indels + // @DataPoint(description = "Number of SNPs", format = "%d") public int n_SNPs = 0; @DataPoint(description = "Number of singleton SNPs", format = "%d") public int n_singleton_SNPs = 0; - @DataPoint(description = "Number of Indels", format = "%d") + @DataPoint(description = "Number of indels", format = "%d") public int n_indels = 0; - // Number of Indels Sites (counts one for any number of alleles at site) - public int nIndelSites = 0; - - @DataPoint(description = "Number of singleton Indels", format = "%d") + @DataPoint(description = "Number of singleton indels", format = "%d") public int n_singleton_indels = 0; + // + // gold standard + // @DataPoint(description = "Number of Indels overlapping gold standard sites", format = "%d") public int n_indels_matching_gold_standard = 0; @DataPoint(description = "Percent of indels overlapping 
gold standard sites") public String gold_standard_matching_rate; - // counts 1 for each site where the number of alleles > 2 - public int nMultiIndelSites = 0; + // + // multi-allelics + // + // Number of Indels Sites (counts one for any number of alleles at site) + public int nIndelSites = 0; + + @DataPoint(description = "Number of sites with where the number of alleles is greater than 2") + public int n_multiallelic_indel_sites = 0; @DataPoint(description = "Percent of indel sites that are multi-allelic") public String percent_of_sites_with_more_than_2_alleles; + // + // snp : indel ratios + // @DataPoint(description = "SNP to indel ratio") public String SNP_to_indel_ratio; @DataPoint(description = "Singleton SNP to indel ratio") public String SNP_to_indel_ratio_for_singletons; + // + // novelty + // + @DataPoint(description = "Number of novel indels", format = "%d") + public int n_novel_indels = 0; + @DataPoint(description = "Indel novelty rate") public String indel_novelty_rate; - @DataPoint(description = "Frameshift percent") - public String frameshift_rate_for_coding_indels; - // // insertions to deletions // + @DataPoint(description = "Number of insertion indels") + public int n_insertions = 0; + + @DataPoint(description = "Number of deletion indels") + public int n_deletions = 0; + @DataPoint(description = "Insertion to deletion ratio") public String insertion_to_deletion_ratio; + @DataPoint(description = "Number of large (>10 bp) deletions") + public int n_large_deletions = 0; + + @DataPoint(description = "Number of large (>10 bp) insertions") + public int n_large_insertions = 0; + + @DataPoint(description = "Ratio of large (>10 bp) insertions to deletions") + public String insertion_to_deletion_ratio_for_large_indels; + // // Frameshifts // @@ -95,6 +124,9 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { @DataPoint(description = "Number of indels in protein-coding regions not labeled as frameshift") public int 
n_coding_indels_in_frame = 0; + @DataPoint(description = "Frameshift percent") + public String frameshift_rate_for_coding_indels; + // // Het : hom ratios // @@ -106,8 +138,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { int nSNPHets = 0, nSNPHoms = 0, nIndelHets = 0, nIndelHoms = 0; - int nKnownIndels = 0, nInsertions = 0; - int[] insertionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used int[] deletionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used @@ -129,15 +159,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { public final static int LARGE_INDEL_SIZE_THRESHOLD = 10; - @DataPoint(description = "Number of large (>10 bp) deletions") - public int n_large_deletions = 0; - - @DataPoint(description = "Number of large (>10 bp) insertions") - public int n_large_insertions = 0; - - @DataPoint(description = "Ratio of large (>10 bp) insertions to deletions") - public String insertion_to_deletion_ratio_for_large_indels; - @Override public int getComparisonOrder() { return 2; } public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -158,10 +179,9 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { break; case INDEL: final VariantContext gold = getWalker().goldStandard == null ? null : tracker.getFirstValue(getWalker().goldStandard); - if ( eval.isComplexIndel() ) break; // don't count complex substitutions - + nIndelSites++; - if ( ! eval.isBiallelic() ) nMultiIndelSites++; + if ( ! 
eval.isBiallelic() ) n_multiallelic_indel_sites++; // collect information about het / hom ratio for ( final Genotype g : eval.getGenotypes() ) { @@ -172,13 +192,14 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { for ( Allele alt : eval.getAlternateAlleles() ) { n_indels++; // +1 for each alt allele if ( variantWasSingleton(eval) ) n_singleton_indels++; - if ( comp != null ) nKnownIndels++; // TODO -- make this test allele specific? + if ( comp == null ) n_novel_indels++; // TODO -- make this test allele specific? if ( gold != null ) n_indels_matching_gold_standard++; // ins : del ratios final int alleleSize = alt.length() - eval.getReference().length(); if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference()); - if ( alleleSize > 0 ) nInsertions++; + if ( alleleSize > 0 ) n_insertions++; + if ( alleleSize < 0 ) n_deletions++; // requires snpEFF annotations if ( eval.getAttributeAsString("SNPEFF_GENE_BIOTYPE", "missing").equals("protein_coding") ) { @@ -216,12 +237,12 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { } public void finalizeEvaluation() { - percent_of_sites_with_more_than_2_alleles = Utils.formattedRatio(nMultiIndelSites, nIndelSites); + percent_of_sites_with_more_than_2_alleles = Utils.formattedPercent(n_multiallelic_indel_sites, nIndelSites); SNP_to_indel_ratio = Utils.formattedRatio(n_SNPs, n_indels); SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels); - gold_standard_matching_rate = Utils.formattedNoveltyRate(n_indels_matching_gold_standard, n_indels); - indel_novelty_rate = Utils.formattedNoveltyRate(nKnownIndels, n_indels); + gold_standard_matching_rate = Utils.formattedPercent(n_indels_matching_gold_standard, n_indels); + indel_novelty_rate = Utils.formattedNoveltyRate(n_indels - n_novel_indels, n_indels); frameshift_rate_for_coding_indels = 
Utils.formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting); ratio_of_1_and_2_to_3_bp_deletions = Utils.formattedRatio(deletionCountByLength[1] + deletionCountByLength[2], deletionCountByLength[3]); @@ -230,7 +251,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { SNP_het_to_hom_ratio = Utils.formattedRatio(nSNPHets, nSNPHoms); indel_het_to_hom_ratio = Utils.formattedRatio(nIndelHets, nIndelHoms); - insertion_to_deletion_ratio = Utils.formattedRatio(nInsertions, n_indels - nInsertions); + insertion_to_deletion_ratio = Utils.formattedRatio(n_insertions, n_deletions); insertion_to_deletion_ratio_for_large_indels = Utils.formattedRatio(n_large_insertions, n_large_deletions); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java index bb4cab750..df4c3e860 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java @@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; public abstract class VariantEvaluator implements Comparable { @@ -67,4 +68,41 @@ public abstract class VariantEvaluator implements Comparable { public int compareTo(final VariantEvaluator variantEvaluator) { return getSimpleName().compareTo(variantEvaluator.getSimpleName()); } + + /** + * Evaluation modules that override this function to 
indicate that they support + * combining the results of two independent collections of eval data into + * a single meaningful result. The purpose of this interface is to + * allow us to cut up the input data into many independent stratifications, and then + * at the end of the eval run decide which stratifications to combine. This is + * important in the case of AC, where you may have thousands of distinct AC + * values that chop up the number of variants to too small a number of variants, + * and you'd like to combine the AC values into ranges containing some percent + * of the data. + * + * For example, suppose you have an eval that + * counts variants in a variable nVariants. If you want to be able to combine + * multiple evaluations of this type, override the combine function + * with a function that sets this.nVariants += other.nVariants. + * + * Add in the appropriate fields of the VariantEvaluator T + * (of the same type as this object) to the values of this object. + * + * The values in this and other are implicitly independent, so that + * the values can be added together.
+ * + * @param other a VariantEvaluator of the same type of this object + */ + public void combine(final VariantEvaluator other) { + throw new ReviewedStingException(getSimpleName() + " doesn't support combining results, sorry"); + } + + /** + * Must be overloaded to return true for evaluation modules that support the combine operation + * + * @return + */ + public boolean supportsCombine() { + return false; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java new file mode 100644 index 000000000..21255f7b3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Tag this stratification as dynamically determining the final strat based on the input data + * + * The paradigm here is simple. We upfront create a strat with N states that reflect the finest grained + * possible division of the data. The data is processed, and statistics collected for each of the N states. + * An update call is made to the stratification for evaluation VariantContext during each map call, + * allowing the strat to collect data about the usage of each state. A final call requests that + * the stratification map down the N states into M states (typically less than N, not necessarily + * a subset of N). This is provided by returning a map from each of M state -> N states and + * the VariantEval walker will combine all of the evaluations for N into a single value for + * each M. + * + * For example, suppose I have a dynamic strat called AC, adopting 7 possible values 0,1,2,3,4,5,6. This + * strat tracks the number of eval vcs for each state, with final counts 0=1, 1=100, 2=10, 3=5, 4=3, 5=2, 6=1. + * The stratification attempts to combine the strats down so that each state has approximately the same + * fraction of the data in each bin. Overall there are 1+100+10+5+3+2+1=122 observations and 7 bins so we really + * want ~ 18 observations in each bin. So we merge 3-6 with 5+3+2+1 = 11 and keep 2, 1, and 0 as distinct bins. We + * return a map from 0 -> 0, 1 -> 1, 2 -> 2, 3-6 -> {3,4,5,6}. + * + * TODO - some open implementation questions + * -- We should only create one stratifier overall. How do we track this? When we create the stratifiers + * perhaps we can look at them and create a tracker? + * -- How do we create a new stratifier based on the finalStratifications() given the framework?
Conceptually + * this new thing is itself a stratifier, just like before, but its states are determined at the end. We'd + * then like to call not getRelevantStates but a different function that accepts an old state and returns + * the new state. Perhaps the process should look like: + * finalizeStratification -> new Stratifier whose states are the final ones + * getNewState(old state) -> new state (one of those in getFinalStratification) + * + * @author Mark DePristo + * @since 4/9/12 + */ +public interface DynamicStratification { + public void update(final VariantContext eval); + public VariantStratifier finalizeStratification(); + public Object getFinalState(final Object oldState); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java index fe4f7641f..65633bc2b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java @@ -50,7 +50,7 @@ public class OneBPIndel extends VariantStratifier { public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { if (eval != null && eval.isIndel()) { for ( int l : eval.getIndelLengths() ) - if ( l > 1 ) + if ( Math.abs(l) > 1 ) return TWO_PLUS_BP; // someone is too long return ONE_BP; // all lengths are one } else diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java index 86821fbc1..5e8db8107 100644 ---
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java @@ -26,6 +26,8 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manage import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.EvaluationContext; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -54,16 +56,27 @@ public class StratificationManager implements Map strats) { - stratifiers = new ArrayList(strats); + this.stratifiers = new ArrayList(strats); + + // construct and store the full tree of strats this.root = buildStratificationTree(new LinkedList(strats)); + // assign the linear key ordering to the leafs assignKeys(root); + // cache the size, and check for a bad state this.size = root.size(); if ( this.size == 0 ) throw new ReviewedStingException("Size == 0 in StratificationManager"); + // prepare the assocated data vectors mapping from key -> data this.valuesByKey = new ArrayList(size()); this.stratifierValuesByKey = new ArrayList>(size()); this.keyStrings = new ArrayList(size()); @@ -72,9 +85,20 @@ public class StratificationManager implements Map buildStratificationTree(final Queue strats) { final K first = strats.poll(); if ( first == null ) { @@ -97,6 +121,10 @@ public class StratificationManager implements Map root) { int key = 0; @@ -106,15 +134,23 @@ public class StratificationManager implements Map root) { + /** + * Entry point to recursive tool that fills in the list of state values corresponding + * to each key. 
After this function is called you can map from key -> List of StateValues + * instead of walking the tree to find the key and reading the list of state values + * + * @param root + */ + private void assignStratifierValuesByKey(final StratNode root) { assignStratifierValuesByKey(root, new LinkedList()); - + + // do a last sanity check that no key has null value after assigning for ( List stateValues : stratifierValuesByKey ) if ( stateValues == null ) throw new ReviewedStingException("Found a null state value set that's null"); } - public void assignStratifierValuesByKey(final StratNode node, final LinkedList states) { + private void assignStratifierValuesByKey(final StratNode node, final LinkedList states) { if ( node.isLeaf() ) { // we're here! if ( states.isEmpty() ) throw new ReviewedStingException("Found a leaf node with an empty state values vector"); @@ -134,13 +170,17 @@ public class StratificationManager implements Map= 0") public int size() { return size; } @Ensures("result != null") - public StratNode getRoot() { + protected StratNode getRoot() { return root; } @@ -188,7 +228,7 @@ public class StratificationManager implements Map implements Map> combineStates(final List first, final List second) { - List> combined = new ArrayList>(first.size()); + final List> combined = new ArrayList>(first.size()); for ( int i = 0; i < first.size(); i++ ) { final Object firstI = first.get(i); final Object secondI = second.get(i); @@ -310,4 +350,77 @@ public class StratificationManager implements Map { + /** take two values of type V and return a combined value of type V */ + public V combine(final V lhs, final V rhs); + } + + /** + * Remaps the stratifications from one stratification set to another, combining + * the values in V according to the combiner function. + * + * stratifierToReplace defines a set of states S1, while newStratifier defines + * a new set S2. remappedStates is a map from all of S1 into at least some of + * S2. 
This function creates a new, fully initialized manager where all of the + * data in this new manager is derived from the original data in this object + * combined according to the mapping remappedStates. When multiple + * elements of S1 can map to the same value in S2, these are sequentially + * combined by the function combiner. Suppose for example at states s1, s2, and + * s3 all map to N1. Eventually the value associated with state N1 would be + * + * value(N1) = combine(value(s1), combine(value(s2), value(s3)) + * + * in some order for s1, s2, and s3, which is not defined. Note that this function + * only supports combining one stratification at a time, but in principle a loop over + * stratifications and this function could do the multi-dimensional collapse. + * + * @param stratifierToReplace + * @param newStratifier + * @param combiner + * @param remappedStates + * @return + */ + public StratificationManager combineStrats(final K stratifierToReplace, + final K newStratifier, + final Combiner combiner, + final Map remappedStates) { + // make sure the mapping is reasonable + if ( ! newStratifier.getAllStates().containsAll(remappedStates.values()) ) + throw new ReviewedStingException("combineStrats: remapped states contains states not found in newStratifer state set"); + + if ( ! 
remappedStates.keySet().containsAll(stratifierToReplace.getAllStates()) ) + throw new ReviewedStingException("combineStrats: remapped states missing mapping for some states"); + + // the new strats are the old ones with the single replacement + final List newStrats = new ArrayList(getStratifiers()); + final int stratOffset = newStrats.indexOf(stratifierToReplace); + if ( stratOffset == -1 ) + throw new ReviewedStingException("Could not find strat to replace " + stratifierToReplace + " in existing strats " + newStrats); + newStrats.set(stratOffset, newStratifier); + + // create an empty but fully initialized new manager + final StratificationManager combined = new StratificationManager(newStrats); + + // for each key, get its state, update it according to the map, and update the combined manager + for ( int key = 0; key < size(); key++ ) { + // the new state is just the old one with the replacement + final List newStates = new ArrayList(getStatesForKey(key)); + final Object oldState = newStates.get(stratOffset); + final Object newState = remappedStates.get(oldState); + newStates.set(stratOffset, newState); + + // look up the new key given the new state + final int combinedKey = combined.getKey(newStates); + if ( combinedKey == -1 ) throw new ReviewedStingException("Couldn't find key for states: " + Utils.join(",", newStates)); + + // combine the old value with whatever new value is in combined already + final V combinedValue = combiner.combine(combined.get(combinedKey), get(key)); + + // update the value associated with combined key + combined.set(combinedKey, combinedValue); + } + + return combined; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java index 9363bbd79..390682837 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java 
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.StratificationManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -14,15 +15,23 @@ import java.util.*; public final class EvaluationContext { // NOTE: must be hashset to avoid O(log n) cost of iteration in the very frequently called apply function - private final HashSet evaluationInstances; + final VariantEvalWalker walker; + private final ArrayList evaluationInstances; + private final Set> evaluationClasses; public EvaluationContext(final VariantEvalWalker walker, final Set> evaluationClasses) { - evaluationInstances = new HashSet(evaluationClasses.size()); + this(walker, evaluationClasses, true); + } + + private EvaluationContext(final VariantEvalWalker walker, final Set> evaluationClasses, final boolean doInitialize) { + this.walker = walker; + this.evaluationClasses = evaluationClasses; + this.evaluationInstances = new ArrayList(evaluationClasses.size()); for ( final Class c : evaluationClasses ) { try { final VariantEvaluator eval = c.newInstance(); - eval.initialize(walker); + if ( doInitialize ) eval.initialize(walker); evaluationInstances.add(eval); } catch (InstantiationException e) { throw new ReviewedStingException("Unable to instantiate eval module '" + c.getSimpleName() + "'", e); @@ -62,4 +71,20 @@ public final class EvaluationContext { } } } + + public void combine(final 
EvaluationContext rhs) { + for ( int i = 0; i < evaluationInstances.size(); i++ ) + evaluationInstances.get(i).combine(rhs.evaluationInstances.get(i)); + } + + public final static EvaluationContextCombiner COMBINER = new EvaluationContext.EvaluationContextCombiner(); + private static class EvaluationContextCombiner implements StratificationManager.Combiner { + @Override + public EvaluationContext combine(EvaluationContext lhs, final EvaluationContext rhs) { + if ( lhs == null ) + lhs = new EvaluationContext(rhs.walker, rhs.evaluationClasses, false); + lhs.combine(rhs); + return lhs; + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index a2782fe34..a957bfd85 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -165,9 +165,9 @@ public class VariantDataManager { bottomPercentage = ((float) numToAdd) / ((float) data.size()); } int index = 0, numAdded = 0; - while( numAdded < numToAdd ) { + while( numAdded < numToAdd && index < data.size() ) { final VariantDatum datum = data.get(index++); - if( !datum.atAntiTrainingSite && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) ) { + if( datum != null && !datum.atAntiTrainingSite && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) ) { datum.atAntiTrainingSite = true; trainingData.add( datum ); numAdded++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 3066b0bc6..18b8424b2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -157,6 +157,12 @@ public class CombineVariants extends RodWalker { @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false) public int minimumN = 1; + /** + * This option allows the suppression of the command line in the VCF header. This is most often useful when combining variants for dozens or hundreds of smaller VCFs. + */ + @Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="If true, do not output the header containing the command line used", required=false) + public boolean SUPPRESS_COMMAND_LINE_HEADER = false; + @Hidden @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) public boolean MERGE_INFO_WITH_MAX_AC = false; @@ -183,7 +189,9 @@ public class CombineVariants extends RodWalker { Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); if ( SET_KEY != null ) headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants")); - vcfWriter.writeHeader(new VCFHeader(headerLines, sitesOnlyVCF ? Collections.emptySet() : samples)); + VCFHeader vcfHeader = new VCFHeader(headerLines, sitesOnlyVCF ? 
Collections.emptySet() : samples); + vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER); + vcfWriter.writeHeader(vcfHeader); if ( vcfWriter instanceof VCFWriterStub) { sitesOnlyVCF = ((VCFWriterStub)vcfWriter).doNotWriteGenotypes(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java new file mode 100755 index 000000000..714fb938e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.apache.commons.io.FilenameUtils; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; +import org.broadinstitute.sting.utils.text.ListFileUtils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.io.File; +import java.util.*; + +/** + * Selects headers from a VCF source. + *

+ * <p>
+ * Often, a VCF containing many headers will need to be subset in order to facilitate certain formatting guidelines. + * SelectHeaders can be used for this purpose. Given a single VCF file, one or more headers can be extracted from the + * file (based on a complete header name or a pattern match). + * </p>
+ * <h2>Input</h2>
+ * <p>
+ * A set of VCFs. + * </p>
+ * <p/>
+ * <h2>Output</h2>
+ * <p>
+ * A header selected VCF. + * </p>
+ * <p/>
+ * <h2>Examples</h2>
+ * <pre>
+ * Select only the FILTER, FORMAT, and INFO headers:
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectHeaders \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -hn FILTER \
+ *   -hn FORMAT \
+ *   -hn INFO
+ *
+ * Select only the FILTER, FORMAT, and INFO headers and add in the reference file names:
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectHeaders \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -hn FILTER \
+ *   -hn FORMAT \
+ *   -hn INFO \
+ *   -irn \
+ *   -iln
+ *
+ * Select only the FILTER, FORMAT, and INFO headers, plus any headers with SnpEff:
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectHeaders \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -hn FILTER \
+ *   -hn FORMAT \
+ *   -hn INFO \
+ *   -he '.*SnpEff.*'
+ * </pre>
+ */ +@SuppressWarnings("unused") +public class SelectHeaders extends RodWalker implements TreeReducible { + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Output(doc = "File to which variants should be written", required = true) + protected VCFWriter vcfWriter; + + @Argument(fullName = "header_name", shortName = "hn", doc = "Include header. Can be specified multiple times", required = false) + public Set headerNames; + + @Argument(fullName = "header_expression", shortName = "he", doc = "Regular expression to select many headers from the tracks provided. Can be specified multiple times", required = false) + public Set headerExpressions; + + /** + * Note that header exclusion takes precedence over inclusion, so that if a header is in both lists it will be excluded. + */ + @Argument(fullName = "exclude_header_name", shortName = "xl_hn", doc = "Exclude header. Can be specified multiple times", required = false) + public Set XLheaderNames; + + /** + * Note that reference inclusion takes precedence over other header matching. If set other reference lines may be excluded but the file name will still be added. + */ + @Argument(fullName = "include_reference_name", shortName = "irn", doc = "If set the reference file name minus the file extension will be added to the headers", required = false) + public boolean includeReference; + + /** + * Note that interval name inclusion takes precedence over other header matching. If set other interval lines may be excluded but the intervals will still be added. + */ + @Argument(fullName = "include_interval_names", shortName = "iln", doc = "If set the interval file name minus the file extension, or the command line intervals, will be added to the headers", required = false) + public boolean includeIntervals; + + /** + * Note that engine header inclusion takes precedence over other header matching. 
If set other engine lines may be excluded but the engine headers will still be added. + */ + @Hidden // TODO: Determine if others find this valuable and either remove @Hidden or remove -ieh. + @Argument(fullName = "include_engine_headers", shortName = "ieh", doc = "If set the headers normally output by the engine will be added to the headers", required = false) + public boolean includeEngineHeaders; + + private static final ListFileUtils.StringConverter headerKey = new ListFileUtils.StringConverter() { + @Override + public String convert(VCFHeaderLine value) { + return value.getKey(); + } + }; + + /** + * Set up the VCF writer, the header expressions and regexps + */ + @Override + public void initialize() { + // Get list of samples to include in the output + List rodNames = Arrays.asList(variantCollection.variants.getName()); + + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); + Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); + + headerLines.add(new VCFHeaderLine(VCFHeader.SOURCE_KEY, "SelectHeaders")); + + // Select only the headers requested by name or expression. + headerLines = new LinkedHashSet(getSelectedHeaders(headerLines)); + + // Optionally add in the reference. + if (includeReference && getToolkit().getArguments().referenceFile != null) + headerLines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, FilenameUtils.getBaseName(getToolkit().getArguments().referenceFile.getName()))); + + // Optionally add in the intervals.
+ if (includeIntervals && getToolkit().getArguments().intervals != null) { + for (IntervalBinding intervalBinding : getToolkit().getArguments().intervals) { + String source = intervalBinding.getSource(); + if (source == null) + continue; + File file = new File(source); + if (file.exists()) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); + } else { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source)); + } + } + } + + TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + VCFHeader vcfHeader = new VCFHeader(headerLines, vcfSamples); + vcfHeader.setWriteEngineHeaders(includeEngineHeaders); + vcfWriter.writeHeader(vcfHeader); + } + + private Set getSelectedHeaders(Set headerLines) { + Set selectedHeaders = new TreeSet(); + if (headerNames == null && headerExpressions == null) { + // Include everything if nothing was explicitly included. + selectedHeaders.addAll(headerLines); + } else { + // Only include the selected headers. + if (headerNames != null) + selectedHeaders.addAll(ListFileUtils.includeMatching(headerLines, headerKey, headerNames, true)); + if (headerExpressions != null) + selectedHeaders.addAll(ListFileUtils.includeMatching(headerLines, headerKey, headerExpressions, false)); + } + + // Remove any excluded headers. 
+ if (XLheaderNames != null) + selectedHeaders = ListFileUtils.excludeMatching(selectedHeaders, headerKey, XLheaderNames, true); + return selectedHeaders; + } + + /** + * Pass through the VC record + * + * @param tracker the ROD tracker + * @param ref reference information + * @param context alignment info + * @return number of records processed + */ + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + int count = 0; + if (tracker != null) { + Collection vcs = tracker.getValues(variantCollection.variants, context.getLocation()); + if (vcs != null) { + for (VariantContext vc : vcs) { + vcfWriter.add(vc); + count++; + } + } + } + return count; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; + } + + @Override + public void onTraversalDone(Integer result) { + logger.info(result + " records processed."); + } +} diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java index a3f80af1c..dcdef5aab 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java @@ -194,6 +194,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { */ private static final List gatkPackages = Arrays.asList( "org.broadinstitute.sting.gatk", + "org.broadinstitute.sting.pipeline", "org.broadinstitute.sting.analyzecovariates", "org.broadinstitute.sting.gatk.datasources.reads.utilities"); @@ -251,7 +252,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { */ private void writeFilter(String className, List argumentFields, Set> 
dependents) throws IOException { String content = getContent(TRAIT_TEMPLATE, "org.broadinstitute.sting.queue.function.CommandLineFunction", - className, "", false, String.format(" + \" -read_filter %s\"", className), argumentFields, dependents); + className, "", false, String.format(" + \" --read_filter %s\"", className), argumentFields, dependents); writeFile(GATK_EXTENSIONS_PACKAGE_NAME + "." + className, content); } diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java index 7d393274a..d029454c9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java @@ -52,7 +52,7 @@ public class PairHMM { } - public void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, + public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, final int X_METRIC_LENGTH) { for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RUtils.java b/public/java/src/org/broadinstitute/sting/utils/R/RUtils.java new file mode 100644 index 000000000..b52eed5cf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/R/RUtils.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all 
copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.R; + +import org.apache.commons.lang.StringUtils; + +import java.text.SimpleDateFormat; +import java.util.Collection; +import java.util.Date; + +public class RUtils { + /** + * Converts a collection of values to an R compatible list. A null list will return NA, + * otherwise the values will be escaped with single quotes and combined with c(). + * @param list Collection of values + * @return The R representation of the list + */ + public static String toStringList(Collection list) { + if (list == null) + return "NA"; + if (list.size() == 0) + return "c()"; + return "c('" + StringUtils.join(list, "','") + "')"; + } + + /** + * Converts a collection of values to an R compatible list. A null list will return NA, + * otherwise the values will be combined with c(). + * @param list Collection of values + * @return The R representation of the list + */ + public static String toNumberList(Collection list) { + return list == null ? "NA": "c(" + StringUtils.join(list, ",") + ")"; + } + + /** + * Converts a collection of values to an R compatible list. A null list will return NA, + * otherwise the date will be escaped with single quotes and combined with c(). 
+ * @param list Collection of values + * @return The R representation of the list + */ + public static String toDateList(Collection list) { + return toDateList(list, "''yyyy-MM-dd''"); + } + + /** + * Converts a collection of values to an R compatible list formatted by pattern. + * @param list Collection of values + * @param pattern format pattern string for each date + * @return The R representation of the list + */ + public static String toDateList(Collection list, String pattern) { + + if (list == null) + return "NA"; + SimpleDateFormat format = new SimpleDateFormat(pattern); + StringBuilder sb = new StringBuilder(); + sb.append("c("); + boolean first = true; + for (Date date : list) { + if (!first) sb.append(","); + sb.append(format.format(date)); + first = false; + } + sb.append(")"); + return sb.toString(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 68b220aab..360a855fa 100755 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -31,14 +31,13 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.text.ListFileUtils; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.File; import java.io.FileNotFoundException; import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; /** @@ -74,10 +73,10 @@ public class SampleUtils { * Same as @link getSAMFileSamples but gets all of the samples * in the SAM files loaded by the engine * - * @param engine - * @return + * @param engine engine + * @return samples */ - public final static Set 
getSAMFileSamples(GenomeAnalysisEngine engine) { + public static Set getSAMFileSamples(GenomeAnalysisEngine engine) { return SampleUtils.getSAMFileSamples(engine.getSAMFileHeader()); } @@ -209,89 +208,24 @@ public class SampleUtils { * we try to read a file named E from disk, and if possible all lines from that file are expanded * into unique sample names. * - * @param sampleArgs - * @return + * @param sampleArgs args + * @return samples */ public static Set getSamplesFromCommandLineInput(Collection sampleArgs) { if (sampleArgs != null) { - // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our - // sample list set, and treat the entries as if they had been specified on the command line. - Set samplesFromFiles = new HashSet(); - for (String SAMPLE_EXPRESSION : sampleArgs) { - File sampleFile = new File(SAMPLE_EXPRESSION); - - try { - XReadLines reader = new XReadLines(sampleFile); - - List lines = reader.readLines(); - for (String line : lines) { - samplesFromFiles.add(line.trim()); - } - } catch (FileNotFoundException e) { - samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample - } - } - - return samplesFromFiles; + return ListFileUtils.unpackSet(sampleArgs); } return new HashSet(); } public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) { - Set samples = new HashSet(); - - if (sampleExpressions != null) { - // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our - // sample list set, and treat the entries as if they had been specified on the command line. 
- Set samplesFromFiles = new HashSet(); - for (String sampleExpression : sampleExpressions) { - File sampleFile = new File(sampleExpression); - - try { - XReadLines reader = new XReadLines(sampleFile); - - List lines = reader.readLines(); - for (String line : lines) { - samplesFromFiles.add(line); - } - } catch (FileNotFoundException e) { - // ignore exception - } - } - - sampleExpressions.addAll(samplesFromFiles); - - // Let's now assume that the values in sampleExpressions are literal sample names and not regular - // expressions. Extract those samples specifically so we don't make the mistake of selecting more - // than what the user really wants. - Set possibleSampleRegexs = new HashSet(); - for (String sampleExpression : sampleExpressions) { - if (!(new File(sampleExpression).exists())) { - if (vcfSamples.contains(sampleExpression)) { - samples.add(sampleExpression); - } else { - possibleSampleRegexs.add(sampleExpression); - } - } - } - - // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions - for (String sampleRegex : possibleSampleRegexs) { - Pattern p = Pattern.compile(sampleRegex); - - for (String vcfSample : vcfSamples) { - Matcher m = p.matcher(vcfSample); - if (m.find()) { - samples.add(vcfSample); - } - } - } + Set samples = ListFileUtils.unpackSet(vcfSamples); + if (sampleExpressions == null) { + return samples; } else { - samples.addAll(vcfSamples); + return ListFileUtils.includeMatching(samples, sampleExpressions, false); } - - return samples; } /** @@ -304,16 +238,7 @@ public class SampleUtils { // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions Set samples = new HashSet(); if (sampleExpressions != null) { - for (String expression : sampleExpressions) { - Pattern p = Pattern.compile(expression); - - for (String originalSample : originalSamples) { - Matcher m = p.matcher(originalSample); - if (m.find()) { - 
samples.add(originalSample); - } - } - } + samples.addAll(ListFileUtils.includeMatching(originalSamples, sampleExpressions, false)); } return samples; } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index c2c608903..7b627fba2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -750,4 +750,18 @@ public class Utils { public static String formattedRatio(final long num, final long denom) { return denom == 0 ? "NA" : String.format("%.2f", num / (1.0 * denom)); } + + /** + * Create a constant map that maps each value in values to itself + * @param values the values to use as both the keys and the values of the map + * @param <T> the type of the values + * @return an unmodifiable map in which every value is mapped to itself + */ + public static Map makeIdentityFunctionMap(Collection values) { + Map map = new HashMap(values.size()); + for ( final T value : values ) + map.put(value, value); + return Collections.unmodifiableMap(map); + } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 37822dc84..764be2ac7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -15,7 +15,7 @@ import java.util.ArrayList; * Date: 1/4/12 */ -public class ActiveRegion implements HasGenomeLocation { +public class ActiveRegion implements HasGenomeLocation, Comparable { private final ArrayList reads = new ArrayList(); private final GenomeLoc activeRegionLoc; @@ -73,6 +73,11 @@ public class ActiveRegion implements HasGenomeLocation { Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); } + @Override + public int compareTo( final ActiveRegion other ) { + return
this.getLocation().compareTo(other.getLocation()); + } + @Override public GenomeLoc getLocation() { return activeRegionLoc; } public GenomeLoc getExtendedLoc() { return extendedLoc; } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 1499f639d..6ef5a2af2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -24,8 +24,10 @@ package org.broadinstitute.sting.utils.activeregion; +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; @@ -45,8 +47,16 @@ public class ActivityProfile { final boolean presetRegions; GenomeLoc regionStartLoc = null; final List isActiveList; - private GenomeLoc lastLoc = null; + private static final int FILTER_SIZE = 65; + private static final Double[] GaussianKernel; + + static { + GaussianKernel = new Double[2*FILTER_SIZE + 1]; + for( int iii = 0; iii < 2*FILTER_SIZE + 1; iii++ ) { + GaussianKernel[iii] = MathUtils.NormalDistribution(FILTER_SIZE, 40.0, iii); + } + } // todo -- add upfront the start and stop of the intervals // todo -- check that no regions are unexpectedly missing @@ -85,15 +95,13 @@ public class ActivityProfile { public ActivityProfile bandPassFilter() { final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]); final Double[] filteredProbArray = new Double[activeProbArray.length]; - final int FILTER_SIZE = ( presetRegions ? 
0 : 50 ); // TODO: needs to be set-able by the walker author - for( int iii = 0; iii < activeProbArray.length; iii++ ) { - double maxVal = 0; - for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(isActiveList.size(), iii+FILTER_SIZE+1); jjj++ ) { - if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; } + if( !presetRegions ) { + for( int iii = 0; iii < activeProbArray.length; iii++ ) { + final Double[] kernel = (Double[]) ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii)); + final Double[] activeProbSubArray = (Double[]) ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1)); + filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); } - filteredProbArray[iii] = maxVal; } - return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc); } @@ -102,9 +110,9 @@ public class ActivityProfile { * @param activeRegionExtension * @return */ - public List createActiveRegions( final int activeRegionExtension ) { - final int MAX_ACTIVE_REGION = ( presetRegions ? 
16001 : 425 ); // TODO: needs to be set-able by the walker author - final double ACTIVE_PROB_THRESHOLD = 0.2; // TODO: needs to be set-able by the walker author + public List createActiveRegions( final int activeRegionExtension, final int maxRegionSize ) { + final double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author + final ArrayList returnList = new ArrayList(); if( isActiveList.size() == 0 ) { // no elements in the active list, just return an empty one @@ -112,25 +120,22 @@ public class ActivityProfile { } else if( isActiveList.size() == 1 ) { // there's a single element, it's either active or inactive boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; - final ActiveRegion region = createActiveRegion(isActive, 0, 0, activeRegionExtension ); - return Collections.singletonList(region); + returnList.addAll(createActiveRegion(isActive, 0, 0, activeRegionExtension, maxRegionSize)); } else { // there are 2+ elements, divide these up into regions - final ArrayList returnList = new ArrayList(); boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; int curStart = 0; for(int iii = 1; iii < isActiveList.size(); iii++ ) { final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD; - if( isActive != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) { - returnList.add( createActiveRegion(isActive, curStart, iii-1, activeRegionExtension) ); + if( isActive != thisStatus ) { + returnList.addAll(createActiveRegion(isActive, curStart, iii - 1, activeRegionExtension, maxRegionSize)); isActive = thisStatus; curStart = iii; } } - returnList.add( createActiveRegion(isActive, curStart, isActiveList.size()-1, activeRegionExtension) ); // close out the current active region - - return returnList; + returnList.addAll(createActiveRegion(isActive, curStart, isActiveList.size() - 1, activeRegionExtension, maxRegionSize)); // close out the current active region } + return returnList; } /** @@ -141,8 +146,25 @@ public 
class ActivityProfile { * @param activeRegionExtension * @return a fully initialized ActiveRegion with the above properties */ - private final ActiveRegion createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension) { - final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); - return new ActiveRegion( loc, isActive, parser, activeRegionExtension ); + private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) { + return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); + } + private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List returnList) { + if( !isActive || curEnd - curStart < maxRegionSize ) { + final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); + returnList.add(new ActiveRegion(loc, isActive, parser, activeRegionExtension)); + return returnList; + } + // find the best place to break up the large active region + Double minProb = Double.MAX_VALUE; + int cutPoint = -1; + for( int iii = curStart + 45; iii < curEnd - 45; iii++ ) { // BUGBUG: assumes maxRegionSize >> 45 + if( isActiveList.get(iii) < minProb ) { minProb = isActiveList.get(iii); cutPoint = iii; } + } + final List leftList = createActiveRegion(isActive, curStart, cutPoint, activeRegionExtension, maxRegionSize, new ArrayList()); + final List rightList = createActiveRegion(isActive, cutPoint, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); + returnList.addAll( leftList ); + returnList.addAll( rightList ); + return returnList; } } diff --git 
a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 0dec305d2..2c4c4f607 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -210,7 +210,12 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { final List alleles = parseAlleles(ref, alts, lineNo); // find out our location - final int start = Integer.valueOf(locParts[1]); + int start = 0; + try { + start = Integer.valueOf(locParts[1]); + } catch (Exception e) { + generateException("the value in the POS field must be an integer but it was " + locParts[1], lineNo); + } int stop = start; // ref alleles don't need to be single bases for monomorphic sites diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index 27bab8c41..50ff3a656 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -1,5 +1,28 @@ -package org.broadinstitute.sting.utils.codecs.vcf; +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.util.ParsingUtils; @@ -35,6 +58,11 @@ public class VCFHeader { // the header string indicator public static final String HEADER_INDICATOR = "#"; + public static final String SOURCE_KEY = "source"; + public static final String REFERENCE_KEY = "reference"; + public static final String CONTIG_KEY = "contig"; + public static final String INTERVALS_KEY = "intervals"; + // were the input samples sorted originally (or are we sorting them)? 
private boolean samplesWereAlreadySorted = true; @@ -42,6 +70,8 @@ public class VCFHeader { protected ArrayList sampleNamesInOrder = null; protected HashMap sampleNameToOffset = null; + private boolean writeEngineHeaders = true; + private boolean writeCommandLine = true; /** * create a VCF header, given a list of meta data and auxillary tags @@ -79,6 +109,7 @@ public class VCFHeader { * using this header (i.e., read by the VCFCodec) will have genotypes * occurring in the same order * + * @param genotypeSampleNamesInAppearenceOrder genotype sample names */ protected void buildVCFReaderMaps(List genotypeSampleNamesInAppearenceOrder) { @@ -144,10 +175,7 @@ public class VCFHeader { * @return a set of the header fields, in order */ public Set getHeaderFields() { - Set fields = new LinkedHashSet(); - for (HEADER_FIELDS field : HEADER_FIELDS.values()) - fields.add(field); - return fields; + return new LinkedHashSet(Arrays.asList(HEADER_FIELDS.values())); } /** @@ -217,7 +245,36 @@ public class VCFHeader { public VCFHeaderLine getOtherHeaderLine(String key) { return mOtherMetaData.get(key); } + + /** + * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. + * @return true if additional engine headers will be written to the VCF + */ + public boolean isWriteEngineHeaders() { + return writeEngineHeaders; + } + + /** + * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. + * @param writeEngineHeaders true if additional engine headers will be written to the VCF + */ + public void setWriteEngineHeaders(boolean writeEngineHeaders) { + this.writeEngineHeaders = writeEngineHeaders; + } + + /** + * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. 
+ * @return true if the command line will be written to the VCF + */ + public boolean isWriteCommandLine() { + return writeCommandLine; + } + + /** + * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. + * @param writeCommandLine true if the command line will be written to the VCF + */ + public void setWriteCommandLine(boolean writeCommandLine) { + this.writeCommandLine = writeCommandLine; + } } - - - diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index ea6901bb3..e3107c195 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -677,11 +677,11 @@ public abstract class AbstractReadBackedPileup filteredElements = tracker.getElements(sampleNames); return filteredElements != null ? (RBP) createNewPileup(loc, filteredElements) : null; } else { - HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop + HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. + if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. 
if (read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) filteredTracker.add(p); } else { @@ -693,6 +693,38 @@ public abstract class AbstractReadBackedPileup getPileupsForSamples(Collection sampleNames) { + Map result = new HashMap(); + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + for (String sample : sampleNames) { + PileupElementTracker filteredElements = tracker.getElements(sampleNames); + if (filteredElements != null) + result.put(sample, createNewPileup(loc, filteredElements)); + } + } else { + Map> trackerMap = new HashMap>(); + + for (String sample : sampleNames) { // initialize pileups for each sample + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + trackerMap.put(sample, filteredTracker); + } + for (PE p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup + GATKSAMRecord read = p.getRead(); + if (read.getReadGroup() != null) { + String sample = read.getReadGroup().getSample(); + UnifiedPileupElementTracker tracker = trackerMap.get(sample); + if (tracker != null) // we only add the pileup the requested samples. 
Completely ignore the rest + tracker.add(p); + } + } + for (Map.Entry> entry : trackerMap.entrySet()) // create the RBP for each sample + result.put(entry.getKey(), createNewPileup(loc, entry.getValue())); + } + return result; + } + @Override public RBP getPileupForSample(String sampleName) { diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 771721169..81ba00888 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -32,8 +32,6 @@ public class PileupElement implements Comparable { protected final int eventLength; // what is the length of the event (insertion or deletion) *after* this base protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases - - /** * Creates a new pileup element. * diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 110199f06..f15468840 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Collection; import java.util.HashSet; import java.util.List; +import java.util.Map; /** * A data retrieval interface for accessing parts of the pileup. @@ -159,6 +160,16 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getPileupForSamples(Collection sampleNames); + /** + * Gets the particular subset of this pileup for each given sample name. 
+ * + * Same as calling getPileupForSample for all samples, but in O(n) instead of O(n^2). + * + * @param sampleNames Names of the samples to use. + * @return A map from each sample name to the subset of this pileup containing only reads with that sample. + */ + public Map getPileupsForSamples(Collection sampleNames); + /** * Gets the particular subset of this pileup with the given sample name. diff --git a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java index c146bf4d4..a3bc7a75f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java @@ -34,9 +34,9 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; +import java.io.IOException; +import java.util.*; +import java.util.regex.Pattern; /** * A collection of convenience methods for working with list files. @@ -54,6 +54,7 @@ public class ListFileUtils { * LIST_FILE_COMMENT_START are ignored. * * @param samFiles The sam files, in string format. + * @param parser Parser * @return a flattened list of the bam files provided */ public static List unpackBAMFileList(final List samFiles, final ParsingEngine parser) { @@ -63,10 +64,8 @@ public class ListFileUtils { inputFileName = expandFileName(inputFileName); if (inputFileName.toLowerCase().endsWith(".list") ) { try { - for ( String fileName : new XReadLines(new File(inputFileName), true) ) { - if ( fileName.length() > 0 && ! 
fileName.startsWith(LIST_FILE_COMMENT_START) ) { - unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName))); - } + for ( String fileName : new XReadLines(new File(inputFileName), true, LIST_FILE_COMMENT_START) ) { + unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName))); } } catch( FileNotFoundException ex ) { @@ -91,9 +90,11 @@ public class ListFileUtils { /** * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine. * @param RODBindings a text equivale + * @param parser Parser * @return a list of expanded, bound RODs. */ @Deprecated + @SuppressWarnings("unused") // TODO: Who is still using this? External walkers? public static Collection unpackRODBindingsOldStyle(final Collection RODBindings, final ParsingEngine parser) { // todo -- this is a strange home for this code. Move into ROD system Collection rodBindings = new ArrayList(); @@ -112,7 +113,7 @@ public class ListFileUtils { String name = positionalTags.get(0); String type = positionalTags.get(1); - RMDTriplet.RMDStorageType storageType = null; + RMDTriplet.RMDStorageType storageType; if(tags.getValue("storage") != null) storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,tags.getValue("storage")); else if(fileName.toLowerCase().endsWith("stdin")) @@ -129,9 +130,11 @@ public class ListFileUtils { /** * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine. * @param RODBindings a text equivale + * @param parser Parser * @return a list of expanded, bound RODs. */ - public static Collection unpackRODBindings(final Collection RODBindings, final ParsingEngine parser) { + @SuppressWarnings("unchecked") + public static Collection unpackRODBindings(final Collection RODBindings, @SuppressWarnings("unused") final ParsingEngine parser) { // todo -- this is a strange home for this code. 
Move into ROD system Collection rodBindings = new ArrayList(); FeatureManager builderForValidation = new FeatureManager(); @@ -142,7 +145,7 @@ public class ListFileUtils { String name = rodBinding.getName(); String type = rodBinding.getTribbleType(); - RMDTriplet.RMDStorageType storageType = null; + RMDTriplet.RMDStorageType storageType; if(rodBinding.getTags().getValue("storage") != null) storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,rodBinding.getTags().getValue("storage")); else if(fileName.toLowerCase().endsWith("stdin")) @@ -184,4 +187,157 @@ public class ListFileUtils { return "/dev/stdin"; return argument; } + + /** + * Returns a new set of values, containing a final set of values expanded from values + *

+ * Each element E of values can either be a literal string or a file ending in .list. + * For each E ending in .list we try to read a file named E from disk, and if possible + * all lines from that file are expanded into unique values. + * + * @param values Original values + * @return entries from values or the files listed in values + */ + public static Set unpackSet(Collection values) { + if (values == null) + throw new NullPointerException("values cannot be null"); + Set unpackedValues = new LinkedHashSet(); + // Let's first go through the list and see if we were given any files. + // We'll add every entry in the file to our set, and treat the entries as + // if they had been specified on the command line. + for (String value : values) { + File file = new File(value); + if (value.toLowerCase().endsWith(".list") && file.exists()) { + try { + unpackedValues.addAll(new XReadLines(file, true, LIST_FILE_COMMENT_START).readLines()); + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + } else { + unpackedValues.add(value); + } + } + return unpackedValues; + } + + /** + * Returns a new set of values including only values listed by filters + *

+ * Each element E of values can either be a literal string or a file. For each E, + * we try to read a file named E from disk, and if possible all lines from that file are expanded + * into unique names. + *

+ * Filters may also be a file of filters. + * + * @param values Values or files with values + * @param filters Filters or files with filters + * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions + * @return entries from values or the files listed in values, filtered by filters + */ + public static Set includeMatching(Collection values, Collection filters, boolean exactMatch) { + return includeMatching(values, IDENTITY_STRING_CONVERTER, filters, exactMatch); + } + + /** + * Converts a type T to a String representation. + * + * @param Type to convert to a String. + */ + public static interface StringConverter { + String convert(T value); + } + + /** + * Returns a new set of values including only values matching filters + *

+ * Filters may also be a file of filters. + *

+ * The converter should convert T to a unique String for each value in the set. + * + * @param values Values or files with values + * @param converter Converts values to strings + * @param filters Filters or files with filters + * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions + * @return entries from values including only values matching filters + */ + public static Set includeMatching(Collection values, StringConverter converter, Collection filters, boolean exactMatch) { + if (values == null) + throw new NullPointerException("values cannot be null"); + if (converter == null) + throw new NullPointerException("converter cannot be null"); + if (filters == null) + throw new NullPointerException("filters cannot be null"); + + Set unpackedFilters = unpackSet(filters); + Set filteredValues = new LinkedHashSet(); + Collection patterns = null; + if (!exactMatch) + patterns = compilePatterns(unpackedFilters); + for (T value : values) { + String converted = converter.convert(value); + if (unpackedFilters.contains(converted)) { + filteredValues.add(value); + } else if (!exactMatch) { + for (Pattern pattern : patterns) + if (pattern.matcher(converted).find()) + filteredValues.add(value); + } + } + return filteredValues; + } + + /** + * Returns a new set of values excluding any values matching filters. + *

+ * Filters may also be a file of filters. + *

+ * The converter should convert T to a unique String for each value in the set. + * + * @param values Values or files with values + * @param converter Converts values to strings + * @param filters Filters or files with filters + * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions + * @return entries from values excluding any values matching filters + */ + public static Set excludeMatching(Collection values, StringConverter converter, Collection filters, boolean exactMatch) { + if (values == null) + throw new NullPointerException("values cannot be null"); + if (converter == null) + throw new NullPointerException("converter cannot be null"); + if (filters == null) + throw new NullPointerException("filters cannot be null"); + + Set unpackedFilters = unpackSet(filters); + Set filteredValues = new LinkedHashSet(); + filteredValues.addAll(values); + Collection patterns = null; + if (!exactMatch) + patterns = compilePatterns(unpackedFilters); + for (T value : values) { + String converted = converter.convert(value); + if (unpackedFilters.contains(converted)) { + filteredValues.remove(value); + } else if (!exactMatch) { + for (Pattern pattern : patterns) + if (pattern.matcher(converted).find()) + filteredValues.remove(value); + } + } + return filteredValues; + } + + private static Collection compilePatterns(Collection filters) { + Collection patterns = new ArrayList(); + for (String filter: filters) { + patterns.add(Pattern.compile(filter)); + } + return patterns; + } + + protected static final StringConverter IDENTITY_STRING_CONVERTER = new StringConverter() { + @Override + public String convert(String value) { + return value; + } + }; } diff --git a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java index 49e9ddf52..b7fc1bdab 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java +++
b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -12,15 +12,14 @@ * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.utils.text; @@ -48,75 +47,92 @@ import java.util.List; * For the love of god, please use this system for reading lines in a file. 
*/ public class XReadLines implements Iterator, Iterable { - private BufferedReader in; // The stream we're reading from - private String nextline = null; // Return value of next call to next() - private boolean trimWhitespace = true; + private final BufferedReader in; // The stream we're reading from + private String nextLine = null; // Return value of next call to next() + private final boolean trimWhitespace; + private final String commentPrefix; + + public XReadLines(final File filename) throws FileNotFoundException { + this(new FileReader(filename), true, null); + } + + public XReadLines(final File filename, final boolean trimWhitespace) throws FileNotFoundException { + this(new FileReader(filename), trimWhitespace, null); + } /** * Creates a new xReadLines object to read lines from filename * - * @param filename - * @throws FileNotFoundException + * @param filename file name + * @param trimWhitespace trim whitespace + * @param commentPrefix prefix for comments or null if no prefix is set + * @throws FileNotFoundException when the file is not found */ - public XReadLines(final File filename, final boolean trimWhitespace) throws FileNotFoundException { - this(new FileReader(filename), trimWhitespace); + public XReadLines(final File filename, final boolean trimWhitespace, final String commentPrefix) throws FileNotFoundException { + this(new FileReader(filename), trimWhitespace, commentPrefix); } - public XReadLines(final File filename) throws FileNotFoundException { - this(filename, true); + public XReadLines(final InputStream inputStream) throws FileNotFoundException { + this(new InputStreamReader(inputStream), true, null); } - /** - * Creates a new xReadLines object to read lines from fileReader - * - * @param fileReader - * @throws FileNotFoundException - */ - public XReadLines(final FileReader fileReader, final boolean trimWhitespace) throws FileNotFoundException { - this(new BufferedReader(fileReader), trimWhitespace); - } - - public XReadLines(final 
FileReader fileReader) throws FileNotFoundException { - this(fileReader, true); + public XReadLines(final InputStream inputStream, final boolean trimWhitespace) { + this(new InputStreamReader(inputStream), trimWhitespace, null); } /** * Creates a new xReadLines object to read lines from an input stream * - * @param inputStream + * @param inputStream input stream + * @param trimWhitespace trim whitespace + * @param commentPrefix prefix for comments or null if no prefix is set */ - public XReadLines(final InputStream inputStream, final boolean trimWhitespace) { - this(new BufferedReader(new InputStreamReader(inputStream)), trimWhitespace); - } - - public XReadLines(final InputStream inputStream) throws FileNotFoundException { - this(inputStream, true); + public XReadLines(final InputStream inputStream, final boolean trimWhitespace, final String commentPrefix) { + this(new InputStreamReader(inputStream), trimWhitespace, commentPrefix); } /** - * Creates a new xReadLines object to read lines from an bufferedReader + * Creates a new xReadLines object to read lines from a reader * - * @param reader + * @param reader reader + */ + public XReadLines(final Reader reader) { + this(reader, true, null); + } + + /** + * Creates a new xReadLines object to read lines from a reader + * + * @param reader reader + * @param trimWhitespace trim whitespace */ public XReadLines(final Reader reader, final boolean trimWhitespace) { + this(reader, trimWhitespace, null); + } + + /** + * Creates a new xReadLines object to read lines from a reader + * + * @param reader reader + * @param trimWhitespace trim whitespace + * @param commentPrefix prefix for comments or null if no prefix is set + */ + public XReadLines(final Reader reader, final boolean trimWhitespace, final String commentPrefix) { + this.in = (reader instanceof BufferedReader) ? 
(BufferedReader)reader : new BufferedReader(reader); + this.trimWhitespace = trimWhitespace; + this.commentPrefix = commentPrefix; try { - this.in = new BufferedReader(reader); - nextline = readNextLine(); - this.trimWhitespace = trimWhitespace; + this.nextLine = readNextLine(); } catch(IOException e) { throw new IllegalArgumentException(e); } } - public XReadLines(final Reader reader) { - this(reader, true); - } - /** * Reads all of the lines in the file, and returns them as a list of strings * - * @return + * @return all of the lines in the file. */ public List readLines() { List lines = new LinkedList(); @@ -128,38 +144,48 @@ public class XReadLines implements Iterator, Iterable { /** * I'm an iterator too... - * @return + * @return an iterator */ public Iterator iterator() { return this; } public boolean hasNext() { - return nextline != null; + return this.nextLine != null; } /** - * Actually reads the next line from the stream, not accessible publically - * @return + * Actually reads the next line from the stream, not accessible publicly + * @return the next line or null + * @throws IOException if an error occurs */ private String readNextLine() throws IOException { - String nextline = in.readLine(); // Read another line - if (nextline != null && trimWhitespace ) - nextline = nextline.trim(); - return nextline; + String nextLine; + while ((nextLine = this.in.readLine()) != null) { + if (this.trimWhitespace) { + nextLine = nextLine.trim(); + if (nextLine.length() == 0) + continue; + } + if (this.commentPrefix != null) + if (nextLine.startsWith(this.commentPrefix)) + continue; + break; + } + return nextLine; } /** - * Returns the next line (minus whitespace) - * @return + * Returns the next line (optionally minus whitespace) + * @return the next line */ public String next() { try { - String result = nextline; - nextline = readNextLine(); + String result = this.nextLine; + this.nextLine = readNextLine(); // If we haven't reached EOF yet - if (nextline == null) { 
+ if (this.nextLine == null) { in.close(); // And close on EOF } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index a6b2bbb21..d950a4541 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -223,12 +223,12 @@ public class GenotypeLikelihoods { /** * The maximum number of alleles that we can represent as genotype likelihoods */ - final static int MAX_ALLELES_THAT_CAN_BE_GENOTYPED = 50; + public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50; /* * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles */ - private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALLELES_THAT_CAN_BE_GENOTYPED); + private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) { final int numLikelihoods = calculateNumLikelihoods(1+altAlleles, 2); @@ -311,7 +311,7 @@ public class GenotypeLikelihoods { public static GenotypeLikelihoodsAllelePair getAllelePair(final int PLindex) { // make sure that we've cached enough data if ( PLindex >= PLIndexToAlleleIndex.length ) - throw new ReviewedStingException("GATK limitation: cannot genotype more than " + MAX_ALLELES_THAT_CAN_BE_GENOTYPED + " alleles"); + throw new ReviewedStingException("GATK limitation: cannot genotype more than " + MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + " alleles"); return PLIndexToAlleleIndex[PLindex]; } diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 7f5212ba3..f477fedc9 100755 --- 
a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -26,18 +26,17 @@ package org.broadinstitute.sting; import org.apache.commons.lang.StringUtils; -import org.broad.tribble.FeatureCodec; import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; -import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; -import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.gatk.CommandLineExecutable; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.testng.Assert; import org.testng.annotations.BeforeMethod; @@ -315,9 +314,10 @@ public class WalkerTest extends BaseTest { // it's the type we expected System.out.println(String.format(" => %s PASSED", name)); } else { - e.printStackTrace(); - Assert.fail(String.format("Test %s expected exception %s but got %s instead", - name, expectedException, e.getClass())); + if ( e.getCause() != null ) + e.getCause().printStackTrace(System.out); // must print to stdout to see the message + Assert.fail(String.format("Test %s expected exception %s but instead got %s with error message %s", + name, expectedException, e.getClass(), e.getMessage())); } } else { // we didn't expect an exception but we got one :-( diff --git a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java index 192c86fe3..68bd28d7a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java @@ -86,13 +86,15 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------- private class EngineErrorHandlingTestProvider extends TestDataProvider { - Class expectedException; - boolean multiThreaded; + final Class expectedException; + final boolean multiThreaded; + final int iterationsToTest; public EngineErrorHandlingTestProvider(Class exceptedException, final boolean multiThreaded) { super(EngineErrorHandlingTestProvider.class); this.expectedException = exceptedException; this.multiThreaded = multiThreaded; + this.iterationsToTest = multiThreaded ? 10 : 1; setName(String.format("Engine error handling: expected %s, is-multithreaded %b", exceptedException, multiThreaded)); } } @@ -113,9 +115,11 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { // @Test(dataProvider = "EngineErrorHandlingTestProvider") public void testEngineErrorHandlingTestProvider(EngineErrorHandlingTestProvider cfg) { - final String root = "-T ErrorThrowing -R " + b37KGReference; - final String args = root + (cfg.multiThreaded ? " -nt 2" : "") + " -E " + cfg.expectedException.getSimpleName(); - WalkerTestSpec spec = new WalkerTestSpec(args, 0, cfg.expectedException); - executeTest(cfg.toString(), spec); + for ( int i = 0; i < cfg.iterationsToTest; i++ ) { + final String root = "-T ErrorThrowing -R " + b37KGReference; + final String args = root + (cfg.multiThreaded ? 
" -nt 2" : "") + " -E " + cfg.expectedException.getSimpleName(); + WalkerTestSpec spec = new WalkerTestSpec(args, 0, cfg.expectedException); + executeTest(cfg.toString(), spec); + } } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java index ec0db12d3..5759204cf 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -42,13 +42,13 @@ public class GATKReportUnitTest extends BaseTest { Assert.assertEquals(report.getTables().size(), 5); GATKReportTable countVariants = report.getTable("CountVariants"); - Object countVariantsPK = countVariants.getPrimaryKeyByData("dbsnp.eval.none.all"); + Object countVariantsPK = countVariants.getPrimaryKeyByData("CountVariants", "dbsnp", "eval", "none", "all"); Assert.assertEquals(countVariants.get(countVariantsPK, "nProcessedLoci"), "63025520"); Assert.assertEquals(countVariants.get(countVariantsPK, "nNoCalls"), "0"); Assert.assertEquals(countVariants.get(countVariantsPK, "heterozygosity"), 4.73e-06); GATKReportTable validationReport = report.getTable("ValidationReport"); - Object validationReportPK = countVariants.getPrimaryKeyByData("dbsnp.eval.none.novel"); + Object validationReportPK = countVariants.getPrimaryKeyByData("CountVariants", "dbsnp", "eval", "none", "novel"); Assert.assertEquals(validationReport.get(validationReportPK, "PPV"), Double.NaN); } @@ -79,6 +79,49 @@ public class GATKReportUnitTest extends BaseTest { Assert.assertEquals(GATKReportColumn.isRightAlign(value), expected, "right align of '" + value + "'"); } + private GATKReportTable makeBasicTable() { + GATKReport report = GATKReport.newSimpleReport("TableName", "sample", "value"); + GATKReportTable table = report.getTable("TableName"); + report.addRow("foo.1", "hello"); + report.addRow("foo.2", 
"world"); + return table; + } + + @Test + public void testDottedSampleName() { + GATKReportTable table = makeBasicTable(); + Object pk; + + pk = table.getPrimaryKeyByData("foo.1"); + Assert.assertEquals(table.get(pk, "value"), "hello"); + + pk = table.getPrimaryKeyByData("foo.2"); + Assert.assertEquals(table.get(pk, "value"), "world"); + } + + @Test + public void testFindPrimaryKeyByData() { + GATKReportTable table = makeBasicTable(); + Assert.assertNotNull(table.findPrimaryKeyByData("foo.1")); + Assert.assertNotNull(table.findPrimaryKeyByData("foo.1", "hello")); + Assert.assertNotNull(table.findPrimaryKeyByData("foo.2")); + Assert.assertNotNull(table.findPrimaryKeyByData("foo.2", "world")); + Assert.assertNull(table.findPrimaryKeyByData("list", "longer", "than", "column", "count")); + Assert.assertNull(table.findPrimaryKeyByData("short")); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testEmptyFindPrimaryKeyByData() { + GATKReportTable table = makeBasicTable(); + table.findPrimaryKeyByData(); + } + + @Test(expectedExceptions = NullPointerException.class) + public void testNullFindPrimaryKeyByData() { + GATKReportTable table = makeBasicTable(); + table.findPrimaryKeyByData((Object[]) null); + } + @Test public void testSimpleGATKReport() { // Create a new simple GATK report named "TableName" with columns: Roger, is, and Awesome diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/FlagStatIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/FlagStatIntegrationTest.java new file mode 100755 index 000000000..d2acaa588 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/FlagStatIntegrationTest.java @@ -0,0 +1,20 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class FlagStatIntegrationTest extends WalkerTest { + + @Test + public void testFlagStat() { + 
String md5 = "9c4039662f24bfd23ccf67973cb5df29"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T FlagStat -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", + 1, + Arrays.asList(md5)); + executeTest("test flag stat", spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java index 44cf87b45..7d1fc637b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java @@ -38,7 +38,7 @@ public class CountReadsInActiveRegionsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T CountReadsInActiveRegions -R " + b37KGReference + " -I " + b37GoodNA12878BAM + " -L 20:10,000,000-10,200,000 -o %s", 1, - Arrays.asList("fcd581aa6befe85c7297509fa7b34edf")); + Arrays.asList("1e9e8d637d2acde23fa99fe9dc07e3e2")); executeTest("CountReadsInActiveRegions:", spec); } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java index 31c7a4e83..964d768c4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -94,4 +94,18 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } + + @Test + public void testLargeGLs() { + + final double[] BB = new 
double[]{-20000000.0, -20000000.0, 0.0}; + GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB)); + + final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); + + ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); + + int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0]; + Assert.assertEquals(calculatedAlleleCount, 6); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 78167e7e9..015f11048 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -122,16 +122,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test - public void testCallingParameters() { - HashMap e = new HashMap(); - e.put( "--min_base_quality_score 26", "258c1b33349eb3b2d395ec4d69302725" ); - - for ( Map.Entry entry : e.entrySet() ) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 " + entry.getKey(), 1, - Arrays.asList(entry.getValue())); - executeTest(String.format("test calling parameter[%s]", entry.getKey()), spec); - } + public void testMinBaseQualityScore() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, + Arrays.asList("258c1b33349eb3b2d395ec4d69302725")); + 
executeTest("test min_base_quality_score 26", spec); } @Test @@ -142,6 +137,22 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test SLOD", spec); } + @Test + public void testNDA() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("443b2f8882393c4c65277c34cdb6060c")); + executeTest("test NDA", spec); + } + + @Test + public void testCompTrack() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("71251d8893649ea9abd5d9aa65739ba1")); + executeTest("test using comp track", spec); + } + @Test public void testOutputParameter() { HashMap e = new HashMap(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 035bf4020..1ab7b679e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -302,7 +302,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("4c00cfa0fd343fef62d19af0edeb4f65")); + 1, Arrays.asList("8d4530e9cef8531c46bbb693b84d04c7")); 
executeTestParallel("testSelect1", spec); } @@ -330,7 +330,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("4df6654860ad63b7e24e6bc5fbbbcb00")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("bb076f7239039191fde883c5e68483ea")); executeTestParallel("testCompVsEvalAC",spec); } @@ -360,7 +360,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3b85cd0fa37539ff51d34e026f26fef2")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("9d24f34d94d74417e00e3b7bcf84650f")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -372,7 +372,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("bed8751c773b9568218f78c90f13348a")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("7329b0bc73c9ccaf5facd754f3410c38")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -488,7 +488,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("9726c0c8f19d271cf680f5f16f0926b3") + Arrays.asList("aad01b26198b30da5d59a05c08d863bb") ); 
executeTest("testModernVCFWithLargeIndels", spec); } @@ -508,7 +508,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("c89705147ef4233d5de3a539469bd1d1") + Arrays.asList("4fa2557663ef8fb4cdeecd667791985c") ); executeTest("testStandardIndelEval", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java new file mode 100644 index 000000000..ca06ca699 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.gatk.walkers.varianteval; + + +// the imports for unit testing. 
+ +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.StratificationManager; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.EvaluationContext; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class VariantEvalWalkerUnitTest extends BaseTest { + VariantEvalWalker VEwalker; + VariantContext eval; + + + @BeforeMethod + public void init() { + VEwalker = new VariantEvalWalker(); + eval = new VariantContextBuilder("x", "chr1", 1, 1, Collections.singleton(Allele.create("A", true))).make(); + } + + // -------------------------------------------------------------------------------- + // + // Test stratifications / evaluations + // + // -------------------------------------------------------------------------------- + + private class StratifiedEvalTestProvider extends TestDataProvider { + final List stratificationObjects = new ArrayList(); + final Set> evaluationObjects = new HashSet>(); + final List expectedCounts; + final int maxI; + + /** + * + * @param maxI test integers from 1 ... maxI + * @param expectedCounts the expected number of integers from 1 ... 
maxI divisible by each combination, in order, of allStates + * @param allStates all stratification tests, in order + */ + public StratifiedEvalTestProvider(int maxI, + final List expectedCounts, + final List ... allStates) { + super(StratifiedEvalTestProvider.class); + + this.maxI = maxI; + this.expectedCounts = expectedCounts; + this.evaluationObjects.add(CounterEval.class); + + String stateName = ""; + for ( List states : allStates ) { + stratificationObjects.add(new IntegerStratifier(states)); + stateName = stateName + Utils.join(",", states) + " "; + } + + setName(String.format("maxI=%d expectedCounts=%s states=%s", maxI, Utils.join(",", expectedCounts), stateName)); + } + } + + /** + * Test stratifier -> holds a list of integers, and the states are if the integer value of evalName is divisible + * by that number + */ + public static class IntegerStratifier extends VariantStratifier { + final List integers; + + private IntegerStratifier(final List integers) { + this.integers = integers; + initialize(); + } + + @Override + public void initialize() { + states.addAll(integers); + } + + @Override + public List getRelevantStates(final ReferenceContext ref, final RefMetaDataTracker tracker, final VariantContext comp, final String compName, final VariantContext eval, final String evalName, final String sampleName) { + int i = Integer.valueOf(evalName); // a terrible hack, but we can now provide accessible states + List states = new ArrayList(); + for ( int state : integers ) + if ( i % state == 0 ) + states.add(state); + return states; + } + } + + /** + * Test evaluator -> just counts the number of calls to update1 + */ + public static class CounterEval extends VariantEvaluator { + public int count = 0; + + @Override public int getComparisonOrder() { return 1; } + + @Override + public void update1(final VariantContext eval, final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + count++; + } + + @Override + public boolean 
supportsCombine() { + return true; + } + + @Override + public void combine(final VariantEvaluator other) { + this.count += ((CounterEval)other).count; + } + } + + private void initialize(StratifiedEvalTestProvider cfg) { + VEwalker.createStratificationStates(cfg.stratificationObjects, cfg.evaluationObjects); + + final RefMetaDataTracker tracker = new RefMetaDataTracker(); + final ReferenceContext ref = null; + final VariantContext comp = null; + final String compName = null, sampleName = null; + + // increment eval counts for each stratification of divisors of i from 1...maxI + for ( int i = 1; i <= cfg.maxI; i++ ) { + final String evalName = String.valueOf(i); // terrible hack to stratify by divisor + for ( EvaluationContext nec : VEwalker.getEvaluationContexts(tracker, ref, eval, evalName, comp, compName, sampleName) ) { + synchronized (nec) { + nec.apply(tracker, ref, null, comp, eval); + } + } + } + } + + @DataProvider(name = "StratifiedEvalTestProvider") + public Object[][] makeStratifiedEvalTestProvider() { + + new StratifiedEvalTestProvider(4, // test 1, 2, 3, 4 + Arrays.asList(4, 2), // 4 divisible by 1, 2 by 2 + Arrays.asList(1, 2)); + + new StratifiedEvalTestProvider(6, // test 1, 2, 3, 4, 5, 6 + Arrays.asList(6, 3, 2), // 6 divisible by 1, 3 by 2, 2 by 3 + Arrays.asList(1, 2, 3)); + + // test that some states can be empty -- does this work in VE? 
+ new StratifiedEvalTestProvider(6, + Arrays.asList(3, 2), + Arrays.asList(2, 3)); + + // test a single stratification + new StratifiedEvalTestProvider(6, + Arrays.asList(3), + Arrays.asList(2)); + + // test a meaningless state + new StratifiedEvalTestProvider(4, // test 1, 2, 3, 4 + Arrays.asList(4, 2), // 4 divisible by 1, 2 by 2 + Arrays.asList(1, 2), Arrays.asList(1)); + + // test adding a state that divides space in half + new StratifiedEvalTestProvider(4, + Arrays.asList(2, 2), + Arrays.asList(1, 2), Arrays.asList(2)); + + // test pairs of strats + new StratifiedEvalTestProvider(12, + Arrays.asList(4, 3, 2, 3), + Arrays.asList(1, 2), Arrays.asList(3, 4)); + + return StratifiedEvalTestProvider.getTests(StratifiedEvalTestProvider.class); + } + + /** + * Ensures that counting and stratifications all are working properly by iterating + * over integers 1...cfg.maxI and stratifying according to cfg, and that the counts in + * each bin are as expected. + * + * @param cfg + */ + @Test(dataProvider = "StratifiedEvalTestProvider") + public void testBasicOperation(StratifiedEvalTestProvider cfg) { + initialize(cfg); + checkStratificationCountsAreExpected(VEwalker.stratManager, cfg.expectedCounts); + } + + private final void checkStratificationCountsAreExpected(final StratificationManager manager, + final List expectedCounts) { + for ( int key = 0; key < manager.size(); key++ ) { + final String stratStateString = manager.getStratsAndStatesStringForKey(key); + final EvaluationContext nec = manager.get(key); + + for ( final VariantEvaluator ve : nec.getVariantEvaluators() ) { + // test for count here + final CounterEval counterEval = (CounterEval)ve; + final int expected = expectedCounts.get(key); + Assert.assertEquals(counterEval.count, expected, "Count seen of " + counterEval.count + " not expected " + expected + " at " + stratStateString); + } + } + } + + /** + * A derived test on testBasicOperation that checks that combining stratifications + * works as expected by 
ensuring the results are the same when the remapped + * strats are the identity map (A -> A, B -> B, etc) + */ + @Test(dataProvider = "StratifiedEvalTestProvider", dependsOnMethods = {"testBasicOperation"}) + public void testIdentityCombine(StratifiedEvalTestProvider cfg) { + for ( int i = 0; i < cfg.stratificationObjects.size(); i++ ) { + initialize(cfg); + final VariantStratifier toReplace = cfg.stratificationObjects.get(i); + final VariantStratifier newStrat = cfg.stratificationObjects.get(i); + final Map remappedStates = Utils.makeIdentityFunctionMap(newStrat.getAllStates()); + StratificationManager combined = + VEwalker.stratManager.combineStrats(toReplace, newStrat, EvaluationContext.COMBINER, remappedStates); + checkStratificationCountsAreExpected(combined, cfg.expectedCounts); + } + } + +// /** +// * A derived test on testBasicOperation that checks that combining stratifications +// * works as expected. We look into cfg, and if there are multiple states we +// * dynamically create combinations of the stratifications, and ensure that the +// * combined results are as we expected. 
+// */ +// @Test(dataProvider = "StratifiedEvalTestProvider", dependsOnMethods = {"testBasicOperation"}) +// public void testCombinedEachStrat(StratifiedEvalTestProvider cfg) { +// for ( int i = 0; i < cfg.stratificationObjects.size(); i++ ) { +// initialize(cfg); +// final VariantStratifier toReplace = cfg.stratificationObjects.get(i); +// +// // TODO -- replace this code with something that combines values in strat +// final VariantStratifier newStrat = cfg.stratificationObjects.get(i); +// final Map remappedStates = Utils.makeIdentityFunctionMap(newStrat.getAllStates()); +// final List expected = cfg.expectedCounts; +// +// StratificationManager combined = +// VEwalker.stratManager.combineStrats(toReplace, newStrat, EvaluationContext.COMBINER, remappedStates); +// checkStratificationCountsAreExpected(combined, expected); +// } +// } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java new file mode 100644 index 000000000..23bf074e2 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.R; + +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class RUtilsUnitTest { + @DataProvider(name = "stringLists") + public Object[][] getStringLists() { + return new Object[][] { + new Object[] { null, "NA" }, + new Object[] { Collections.EMPTY_LIST, "c()" }, + new Object[] { Arrays.asList("1", "2", "3"), "c('1','2','3')" } + }; + } + + @Test(dataProvider = "stringLists") + public void testToStringList(List actual, String expected) { + Assert.assertEquals(RUtils.toStringList(actual), expected); + } + + @DataProvider(name = "numberLists") + public Object[][] getNumberLists() { + return new Object[][] { + new Object[] { null, "NA" }, + new Object[] { Collections.EMPTY_LIST, "c()" }, + new Object[] { Arrays.asList(1, 2, 3), "c(1,2,3)" }, + new Object[] { Arrays.asList(1D, 2D, 3D), "c(1.0,2.0,3.0)" } + }; + } + + @Test(dataProvider = "numberLists") + public void testToNumberList(List actual, String expected) { + Assert.assertEquals(RUtils.toNumberList(actual), expected); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java index 7d478d063..282f19d8a 100644 --- 
a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -130,7 +130,7 @@ public class ActivityProfileUnitTest extends BaseTest { Assert.assertEquals(profile.size(), cfg.probs.size()); Assert.assertEquals(profile.isActiveList, cfg.probs); - assertRegionsAreEqual(profile.createActiveRegions(0), cfg.expectedRegions); + assertRegionsAreEqual(profile.createActiveRegions(0, 100), cfg.expectedRegions); } private void assertRegionsAreEqual(List actual, List expected) { diff --git a/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java index f0b1de6fe..f21b4bced 100644 --- a/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java @@ -28,17 +28,14 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.ParsingEngine; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.testng.Assert; -import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.List; - +import java.util.*; /** * Tests selected functionality in the CommandLineExecutable class @@ -74,6 +71,76 @@ public class ListFileUtilsUnitTest extends BaseTest { performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking); } + @Test + public void testUnpackSet() throws Exception { + Set expected = new HashSet(Arrays.asList("public/testdata/exampleBAM.bam")); + Set actual; 
+ + actual = ListFileUtils.unpackSet(Arrays.asList("public/testdata/exampleBAM.bam")); + Assert.assertEquals(actual, expected); + + File tempListFile = createTempListFile("testUnpackSet", + "#", + "public/testdata/exampleBAM.bam", + "#public/testdata/foo.bam", + " # public/testdata/bar.bam" + ); + actual = ListFileUtils.unpackSet(Arrays.asList(tempListFile.getAbsolutePath())); + Assert.assertEquals(actual, expected); + } + + @DataProvider(name="includeMatchingTests") + public Object[][] getIncludeMatchingTests() { + return new Object[][] { + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), true, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), false, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), true, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), false, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), true, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), false, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), true, asSet("a", "ab") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), false, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), true, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), false, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), true, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), false, asSet("a", "ab", "abc") } + }; + } + + @Test(dataProvider = "includeMatchingTests") + public void testIncludeMatching(Set values, Collection filters, boolean exactMatch, Set expected) { + Set actual = ListFileUtils.includeMatching(values, ListFileUtils.IDENTITY_STRING_CONVERTER, filters, exactMatch); + Assert.assertEquals(actual, expected); + } + + 
@DataProvider(name="excludeMatchingTests") + public Object[][] getExcludeMatchingTests() { + return new Object[][] { + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), true, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), false, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), true, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), false, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), true, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), false, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), true, asSet("abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), false, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), true, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), false, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), true, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), false, Collections.EMPTY_SET } + }; + } + + @Test(dataProvider = "excludeMatchingTests") + public void testExcludeMatching(Set values, Collection filters, boolean exactMatch, Set expected) { + Set actual = ListFileUtils.excludeMatching(values, ListFileUtils.IDENTITY_STRING_CONVERTER, filters, exactMatch); + Assert.assertEquals(actual, expected); + } + + private static Set asSet(T... args){ + return new HashSet(Arrays.asList(args)); + } + private File createTempListFile( String tempFilePrefix, String... 
lines ) throws Exception { File tempListFile = File.createTempFile(tempFilePrefix, ".list"); tempListFile.deleteOnExit(); diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala new file mode 100644 index 000000000..89f2f55fb --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.queue.qscripts.examples + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ + +/** + * Script used for testing a read filter: runs UnifiedGenotyper with the BadMate filter mixed in + */ +class ExampleReadFilter extends QScript { + @Input(doc="The reference file for the bam files.", shortName="R") + var referenceFile: File = _ + + @Input(doc="Bam file to genotype.", shortName="I") + var bamFile: File = _ + + def script() { + val genotyper = new UnifiedGenotyper with BadMate + genotyper.reference_sequence = referenceFile + genotyper.memoryLimit = 2 + genotyper.input_file :+= bamFile + add(genotyper) + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index 085e0b008..2f604a809 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -49,7 +49,6 @@ case class GATKIntervals(reference: File, intervals: Seq[String]) { else IntervalUtils.parseIntervalArguments(parser, intervals) Collections.sort(parsedLocs) - Collections.unmodifiableList(parsedLocs) val mergedLocs = IntervalUtils.mergeIntervalLocations(parsedLocs, IntervalMergingRule.OVERLAPPING_ONLY) Collections.unmodifiableList(mergedLocs) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala index 70046c913..8ac711f25 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala @@ -32,6 +32,8 @@ import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor * Merges a vcf text file. 
*/ class VcfGatherFunction extends CombineVariants with GatherFunction { + this.assumeIdenticalSamples = true + this.suppressCommandLineHeader = true private lazy val originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] @@ -43,7 +45,6 @@ class VcfGatherFunction extends CombineVariants with GatherFunction { this.variant = this.gatherParts.zipWithIndex map { case (input, index) => new TaggedFile(input, "input"+index) } this.out = this.originalOutput - this.assumeIdenticalSamples = true // NO_HEADER and sites_only from VCFWriterArgumentTypeDescriptor // are added by the GATKExtensionsGenerator to the subclass of CommandLineGATK diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala index 22f4f6225..9d51b01a0 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala @@ -136,7 +136,7 @@ object PipelineTest extends BaseTest with Logging { println(" value (min,target,max) table key metric") for (validation <- evalSpec.validations) { val table = report.getTable(validation.table) - val key = table.getPrimaryKeyByData(validation.key) + val key = table.getPrimaryKeyByData(validation.table +: validation.key.split('.') : _*) val value = String.valueOf(table.get(key, validation.metric)) val inRange = if (value == null) false else validation.inRange(value) val flag = if (!inRange) "*" else " " diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala new file mode 100644 index 000000000..7e5e9a93e --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2012, The Broad Institute 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.pipeline.examples + +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +import org.testng.annotations.Test +import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} +import org.broadinstitute.sting.BaseTest + +class ExampleReadFilterPipelineTest { + @Test + def testExampleReadFilter() { + val spec = new PipelineTestSpec + spec.name = "examplereadfilter" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -I " + BaseTest.testDir + "exampleBAM.bam").mkString + PipelineTest.executeTest(spec) + } +} From 82efd4457e184fdba5c621c4d38ebe1ffd020bb7 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 18 Apr 2012 16:35:09 -0400 Subject: [PATCH 03/17] Revert some bad merge changes --- .../gatk/walkers/indels/PairHMMIndelErrorModel.java | 2 +- .../src/org/broadinstitute/sting/utils/PairHMM.java | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index ea6d514f4..343860629 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -124,7 +124,7 @@ public class PairHMMIndelErrorModel { } - static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { + static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { // compute forward hrun length, example: // AGGTGACCCCCCTGAGAG // 001000012345000000 diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java index f4fcf9674..9fcb97a4d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java @@ -41,14 +41,14 @@ public class PairHMM { private static final byte DEFAULT_GCP = (byte) 10; private static final double BANDING_TOLERANCE = 22.0; private static final int BANDING_CLUSTER_WINDOW = 12; - private final boolean doBanded; + private final boolean noBanded; public PairHMM() { - doBanded = false; + noBanded = false; } - public PairHMM( final boolean doBanded ) { - this.doBanded = doBanded; + public PairHMM( final boolean noBanded ) { + this.noBanded = noBanded; } @@ -100,7 +100,7 @@ public class PairHMM { readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); } - if( doBanded ) { + if( false ) { final ArrayList workQueue = new ArrayList(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step final ArrayList workToBeAdded = new ArrayList(); final ArrayList calculatedValues = new ArrayList(); From 960e7e6aaf1e1ed46d7ee6b6d59802ca8eba716c Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 18 Apr 2012 19:53:42 -0400 Subject: [PATCH 04/17] Changes to integration tests --- .../UnifiedGenotyperIntegrationTest.java | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 4d00f6113..8292b8bc9 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("d3191b2f10139c969501990ffdf29082")); + Arrays.asList("9b08dc6800ba11bc6d9f6ccf392a60fe")); executeTest("test MultiSample Pilot1", spec); } @@ -54,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("7c7288170c6aadae555a44e79ca5bf19")); + Arrays.asList("d275e0f75368dbff012ea8655dce3444")); executeTest("test SingleSample Pilot2", spec); } @@ -80,7 +80,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "2158eb918abb95225ea5372fcd9c9236"; + private final static String COMPRESSED_OUTPUT_MD5 = "1e3c897794e5763a8720807686707b18"; @Test public void testCompressedOutput() { @@ -101,7 +101,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "834e85f6af4ad4a143b913dfc7defb08"; + String md5 = "06d11ed89f02f08911e100df0f7db7a4"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -200,8 +200,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "d5879f1c277035060434d79a441b31ca" ); - e.put( 1.0 / 1850, "13f80245bab2321b92d27eebd5c2fc33" ); + e.put( 0.01, 
"d07e5ca757fbcb1c03f652f82265c2f8" ); + e.put( 1.0 / 1850, "d1fb9186e6f39f2bcf5d0edacd8f7fe2" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -225,7 +225,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("8c134a6e0abcc70d2ed3216d5f8e0100")); + Arrays.asList("623be1fd8b63a01bfe35ac864d5199fe")); executeTest(String.format("test multiple technologies"), spec); } @@ -244,7 +244,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("34baad3177712f6cd0b476f4c578e08f")); + Arrays.asList("40ea10c0238c3be2991d31ae72476884")); executeTest(String.format("test calling with BAQ"), spec); } @@ -263,7 +263,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("4bf4f819a39a73707cae60fe30478742")); + Arrays.asList("c9b0bd900a4ec949adfbd28909581eeb")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -278,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("ae08fbd6b0618cf3ac1be763ed7b41ca")); + Arrays.asList("6b7c8691c527facf9884c2517d943f2f")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -291,7 +291,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("120600f2bfa3a47bd93b50f768f98d5b")); + Arrays.asList("d72603aa33a086d64d4dddfd2995552f")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -301,7 +301,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode 
GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("2e75d2766235eab23091a67ea2947d13")); + Arrays.asList("4a59fe207949b7d043481d7c1b786573")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -311,7 +311,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("5057bd7d07111e8b1085064782eb6c80")); + Arrays.asList("a8a9ccf30bddee94bb1d300600794ee7")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -319,7 +319,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("c0f9ca3ceab90ebd38cc0eec9441d71f")); + Arrays.asList("0b388936022539530f565da14d5496d3")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( @@ -368,7 +368,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("53758e66e3a3188bd9c78d2329d41962")); + Arrays.asList("973178b97efd2daacc9e45c414275d59")); executeTest("test minIndelFraction 0.0", spec); } @@ -376,7 +376,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void 
testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("3aa39b1f6f3b1eb051765f9c21f6f461")); + Arrays.asList("220facd2eb0923515d1d8ab874055564")); executeTest("test minIndelFraction 0.25", spec); } From 143e92b79790cdbed7f60b8e9ecd87f9085b3f04 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 11 Apr 2012 13:56:51 -0400 Subject: [PATCH 06/17] Rebasing --- .../genotyper/UnifiedArgumentCollection.java | 4 +- .../indels/PairHMMIndelErrorModel.java | 250 ++++++------ .../broadinstitute/sting/utils/PairHMM.java | 259 ++++++++++++ .../sting/utils/PairHMMUnitTest.java | 367 ++++++++++++++++++ 4 files changed, 761 insertions(+), 119 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/PairHMM.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index d7174536e..aa4bde0ab 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -147,11 +147,11 @@ public class UnifiedArgumentCollection { @Hidden @Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty", required = false) - public double INDEL_GAP_CONTINUATION_PENALTY = 10.0; + public byte INDEL_GAP_CONTINUATION_PENALTY = 10; @Hidden @Argument(fullName = "indelGapOpenPenalty", shortName = "indelGOP", doc = "Indel gap open penalty", required = false) - public double INDEL_GAP_OPEN_PENALTY = 45.0; + public byte INDEL_GAP_OPEN_PENALTY = 45; @Hidden @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype 
size", required = false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 890ed9e3d..171c42040 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -31,7 +31,9 @@ import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.PairHMM; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -41,13 +43,14 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.Map; public class PairHMMIndelErrorModel { public static final int BASE_QUAL_THRESHOLD = 20; private boolean DEBUG = false; - private boolean bandedLikelihoods = false; + private boolean bandedLikelihoods = true; private static final int MAX_CACHED_QUAL = 127; @@ -60,12 +63,12 @@ public class PairHMMIndelErrorModel { private static final int START_HRUN_GAP_IDX = 4; private static final int MAX_HRUN_GAP_IDX = 20; - private static final double MIN_GAP_OPEN_PENALTY = 30.0; - private static final double MIN_GAP_CONT_PENALTY = 10.0; - private static final double GAP_PENALTY_HRUN_STEP = 1.0; // each increase in hrun decreases gap penalty by this. 
+ private static final byte MIN_GAP_OPEN_PENALTY = 30; + private static final byte MIN_GAP_CONT_PENALTY = 10; + private static final byte GAP_PENALTY_HRUN_STEP = 1; // each increase in hrun decreases gap penalty by this. - private final double[] GAP_OPEN_PROB_TABLE; - private final double[] GAP_CONT_PROB_TABLE; + private final byte[] GAP_OPEN_PROB_TABLE; + private final byte[] GAP_CONT_PROB_TABLE; ///////////////////////////// // Private Member Variables @@ -86,42 +89,42 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean bandedLikelihoods) { + public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) { this.DEBUG = deb; - this.bandedLikelihoods = bandedLikelihoods; + //this.bandedLikelihoods = bandedLikelihoods; // fill gap penalty table, affine naive model: - this.GAP_CONT_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; - this.GAP_OPEN_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; + this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; + this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - double gop = -indelGOP/10.0; - double gcp = -indelGCP/10.0; for (int i = 0; i < START_HRUN_GAP_IDX; i++) { - GAP_OPEN_PROB_TABLE[i] = gop; - GAP_CONT_PROB_TABLE[i] = gcp; + GAP_OPEN_PROB_TABLE[i] = indelGOP; + GAP_CONT_PROB_TABLE[i] = indelGCP; } double step = GAP_PENALTY_HRUN_STEP/10.0; - double maxGOP = -MIN_GAP_OPEN_PENALTY/10.0; // phred to log prob - double maxGCP = -MIN_GAP_CONT_PENALTY/10.0; // phred to log prob + // initialize gop and gcp to their default values + byte gop = indelGOP; + byte gcp = indelGCP; + // all of the following is computed in QUal-space for (int i=START_HRUN_GAP_IDX; i < MAX_HRUN_GAP_IDX; i++) { - gop += step; - if (gop > maxGOP) - gop = maxGOP; + gop -= GAP_PENALTY_HRUN_STEP; + if (gop < MIN_GAP_OPEN_PENALTY) + gop = MIN_GAP_OPEN_PENALTY; - gcp += step; - if(gcp > maxGCP) - gcp = maxGCP; + gcp -= step; + if(gcp < 
MIN_GAP_CONT_PENALTY) + gcp = MIN_GAP_CONT_PENALTY; GAP_OPEN_PROB_TABLE[i] = gop; GAP_CONT_PROB_TABLE[i] = gcp; } } - static private void getContextHomopolymerLength(final byte[] refBytes, int[] hrunArray) { + static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { // compute forward hrun length, example: // AGGTGACCCCCCTGAGAG // 001000012345000000 @@ -155,7 +158,7 @@ public class PairHMMIndelErrorModel { private void updateCell(final int indI, final int indJ, final int X_METRIC_LENGTH, final int Y_METRIC_LENGTH, byte[] readBases, byte[] readQuals, byte[] haplotypeBases, - double[] currentGOP, double[] currentGCP, double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { + byte[] currentGOP, byte[] currentGCP, double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { if (indI > 0 && indJ > 0) { final int im1 = indI -1; final int jm1 = indJ - 1; @@ -168,20 +171,20 @@ public class PairHMMIndelErrorModel { matchMetricArray[indI][indJ] = pBaseRead + MathUtils.approximateLog10SumLog10(new double[]{matchMetricArray[im1][jm1], XMetricArray[im1][jm1], YMetricArray[im1][jm1]}); - final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; - final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; + final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : -(double)currentGOP[im1]/10.0; + final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : -(double)currentGCP[im1]/10.0; XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1); // update Y array - final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; - final double d2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; + final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : -(double)currentGOP[im1]/10.0; + final double d2 = indI == X_METRIC_LENGTH-1 ? 
END_GAP_COST : -(double)currentGCP[im1]/10.0; YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2); } } private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, - double[] currentGOP, double[] currentGCP, int indToStart, + byte[] currentGOP, byte[] currentGCP, int indToStart, double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { final int X_METRIC_LENGTH = readBases.length+1; @@ -349,8 +352,9 @@ public class PairHMMIndelErrorModel { } - private void fillGapProbabilities(int[] hrunProfile, - double[] contextLogGapOpenProbabilities, double[] contextLogGapContinuationProbabilities) { + private void fillGapProbabilities(final int[] hrunProfile, + final byte[] contextLogGapOpenProbabilities, + final byte[] contextLogGapContinuationProbabilities) { // fill based on lookup table for (int i = 0; i < hrunProfile.length; i++) { if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) { @@ -372,27 +376,8 @@ public class PairHMMIndelErrorModel { final int readCounts[] = new int[pileup.getNumberOfElements()]; int readIdx=0; - LinkedHashMap gapOpenProbabilityMap = new LinkedHashMap(); - LinkedHashMap gapContProbabilityMap = new LinkedHashMap(); - - // will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes. 
- // todo -- refactor into separate function - for (Allele a: haplotypeMap.keySet()) { - Haplotype haplotype = haplotypeMap.get(a); - byte[] haplotypeBases = haplotype.getBases(); - double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length]; - double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length]; - - // get homopolymer length profile for current haplotype - int[] hrunProfile = new int[haplotypeBases.length]; - getContextHomopolymerLength(haplotypeBases,hrunProfile); - fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); - - gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities); - gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities); - - } + PairHMM pairHMM = new PairHMM(bandedLikelihoods); for (PileupElement p: pileup) { // > 1 when the read is a consensus read representing multiple independent observations readCounts[readIdx] = p.getRepresentativeCount(); @@ -408,12 +393,27 @@ public class PairHMMIndelErrorModel { else { // System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName()); GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + if (read.isEmpty()) continue; - if(ReadUtils.is454Read(read)) { + if (read.getUnclippedEnd() > ref.getWindow().getStop()) + read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop()); + + if (read.isEmpty()) continue; - } + + if (read.getUnclippedStart() < ref.getWindow().getStart()) + read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart()); + + if (read.isEmpty()) + continue; + // hard-clip low quality ends - this may introduce extra H elements in CIGAR string + read = ReadClipper.hardClipLowQualEnds(read,(byte)BASE_QUAL_THRESHOLD ); + + if (read.isEmpty()) + continue; + // get bases of candidate haplotypes that overlap with reads final int trailingBases = 3; @@ -469,54 +469,56 @@ public 
class PairHMMIndelErrorModel { unclippedReadBases = read.getReadBases(); unclippedReadQuals = read.getBaseQualities(); - // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, - // and may leave a string of Q2 bases still hanging off the reads. - for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) { - if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) - numStartClippedBases++; - else - break; + final int extraOffset = Math.abs(eventLength); - } - for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){ - if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) - numEndClippedBases++; - else - break; - } + /** + * Compute genomic locations that candidate haplotypes will span. + * Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord, + * adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above. + * We will propose haplotypes that overlap the read with some padding. + * True read start = readStart + numStartClippedBases - ReadUtils.getFirstInsertionOffset(read) + * Last term is because if a read starts with an insertion then these bases are not accounted for in readStart. 
+ * trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to + * differentiate context between two haplotypes + */ + long startLocationInRefForHaplotypes = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0); + long stopLocationInRefForHaplotypes = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset; - int extraOffset = Math.abs(eventLength); + if (DEBUG) + System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); - long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0); - long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset; - - // Variables start and stop are coordinates (inclusive) where we want to get the haplotype from. int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases; // check if start of read will be before start of reference context - if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut - start = ref.getWindow().getStart(); - + if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) { + // read starts before haplotype: read will have to be cut + //numStartClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes; + startLocationInRefForHaplotypes = ref.getWindow().getStart(); + } // check also if end of read will go beyond reference context - if (stop > ref.getWindow().getStop()) - stop = ref.getWindow().getStop(); + if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) { + //numEndClippedBases += stopLocationInRefForHaplotypes - ref.getWindow().getStop(); + stopLocationInRefForHaplotypes = ref.getWindow().getStop(); + } - // if there's an insertion in the read, the read stop position 
will be less than start + read length, + // if there's an insertion in the read, the read stop position will be less than start + read length, // but we want to compute likelihoods in the whole region that a read might overlap - if (stop <= start + readLength) { - stop = start + readLength-1; + if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) { + stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; } // ok, we now figured out total number of clipped bases on both ends. // Figure out where we want to place the haplotype to score read against - /* - if (DEBUG) - System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", - numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength()); - */ + + if (DEBUG) + System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", + numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength()); LinkedHashMap<Allele,Double> readEl = new LinkedHashMap<Allele,Double>(); + /** + * Check if we'll end up with an empty read once all clipping is done + */ if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) { int j=0; for (Allele a: haplotypeMap.keySet()) { @@ -537,67 +539,81 @@ public class PairHMMIndelErrorModel { // initialize path metric and traceback memories for likelihood computation double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null; byte[] previousHaplotypeSeen = null; - double[] previousGOP = null; - double[] previousGCP = null; - int startIdx; + int startIndexInHaplotype = 0; + final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; + final byte[] contextLogGapContinuationProbabilities = new 
byte[readBases.length]; + + // get homopolymer length profile for current haplotype + int[] hrunProfile = new int[readBases.length]; + getContextHomopolymerLength(readBases,hrunProfile); + fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); + + for (Allele a: haplotypeMap.keySet()) { - Haplotype haplotype = haplotypeMap.get(a); - if (stop > haplotype.getStopPosition()) - stop = haplotype.getStopPosition(); - if (start < haplotype.getStartPosition()) - start = haplotype.getStartPosition(); + if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) + stopLocationInRefForHaplotypes = haplotype.getStopPosition(); - // cut haplotype bases - long indStart = start - haplotype.getStartPosition(); - long indStop = stop - haplotype.getStartPosition(); + if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) + startLocationInRefForHaplotypes = haplotype.getStartPosition(); + + final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); + final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); double readLikelihood; if (DEBUG) System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", - indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength(), read.getCigar().toString()); + indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); + if (indStart < 0 || indStop >= haplotype.getBases().length || indStart > indStop) { // read spanned more than allowed reference context: we currently can't deal with this - readLikelihood =0; + throw new ReviewedStingException("BUG! 
bad read clipping"); +// readLikelihood =0; } else { final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - if (matchMetricArray == null) { - final int X_METRIC_LENGTH = readBases.length+1; - final int Y_METRIC_LENGTH = haplotypeBases.length+1; + final int X_METRIC_LENGTH = readBases.length+1; + final int Y_METRIC_LENGTH = haplotypeBases.length+1; + if (matchMetricArray == null) { + //no need to reallocate arrays for each new haplotype, as length won't change matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + } - final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); - final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); + + pairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + + /* if (previousHaplotypeSeen == null) - startIdx = 0; - else { - final int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); - final int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP); - final int s3 = computeFirstDifferingPosition(currentContextGCP, previousGCP); - startIdx = Math.min(Math.min(s1, s2), s3); - } + startIndexInHaplotype = 0; + else + startIndexInHaplotype = 0; //computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); + previousHaplotypeSeen = haplotypeBases.clone(); - previousGOP = currentContextGOP.clone(); - previousGCP = currentContextGCP.clone(); + */ + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, + contextLogGapOpenProbabilities, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities, + startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); + /* double r2 = 
computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, contextLogGapOpenProbabilities, + contextLogGapContinuationProbabilities, 0, matchMetricArray, XMetricArray, YMetricArray); - readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, - currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray); - - if (DEBUG) { + if (readLikelihood > 0) { + int k=0; + } + */ if (DEBUG) { System.out.println("H:"+new String(haplotypeBases)); System.out.println("R:"+new String(readBases)); System.out.format("L:%4.2f\n",readLikelihood); - System.out.format("StPos:%d\n", startIdx); + // System.out.format("Lorig:%4.2f\n",r2); + System.out.format("StPos:%d\n", startIndexInHaplotype); } } readEl.put(a,readLikelihood); diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java new file mode 100644 index 000000000..58bed2795 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.*; + +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. + * User: rpoplin + * Date: 3/1/12 + */ + +public class PairHMM { + private static final int MAX_CACHED_QUAL = (int)Byte.MAX_VALUE; + private static final byte DEFAULT_GOP = (byte) 45; + private static final byte DEFAULT_GCP = (byte) 10; + private static final double BANDING_TOLERANCE = 22.0; + private static final int BANDING_CLUSTER_WINDOW = 12; + private final boolean doBanded; + + public PairHMM() { + doBanded = false; + } + + public PairHMM( final boolean doBanded ) { + this.doBanded = doBanded; + } + + + public void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, + final int X_METRIC_LENGTH) { + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); + } + + // the initial condition + matchMetricArray[1][1] = 0.0; // Math.log10(1.0); + + } + + @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability + public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, + final byte[] insertionGOP, final 
byte[] deletionGOP, final byte[] overallGCP ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // initial arrays to hold the probabilities of being in the match, insertion and deletion cases + final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + + return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray); + } + + @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability + public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, + final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < 
QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); + } + + if( doBanded ) { + final ArrayList<Integer> workQueue = new ArrayList<Integer>(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step + final ArrayList<Integer> workToBeAdded = new ArrayList<Integer>(); + final ArrayList<Double> calculatedValues = new ArrayList<Double>(); + final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1; + workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype + + for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over + //Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order + int el = 1; + for( int work : workQueue ) { + // choose the appropriate diagonal baseline location + int iii = 0; + int jjj = diag; + if( diag > Y_METRIC_LENGTH ) { + iii = diag - Y_METRIC_LENGTH; + jjj = Y_METRIC_LENGTH; + } + // move to the starting work location along the diagonal + iii += work; + jjj -= work; + while( iii >= X_METRIC_LENGTH || jjj <= 0 ) { + iii--; + jjj++; + work--; + } + if( !detectClusteredStartLocations(workToBeAdded, work ) ) { + workToBeAdded.add(work); // keep this thread going once it has started + } + + if( work >= el - 3 ) { + // step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value + double maxElement = Double.NEGATIVE_INFINITY; + for( el = work; el < numDiags + 1; el++ ) { + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, + insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); + final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); + 
calculatedValues.add(bestMetric); + if( bestMetric > maxElement ) { + maxElement = bestMetric; + } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { + break; + } + if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix + break; + } + if( --jjj <= 0 ) { // don't walk off the edge of the matrix + break; + } + } + + // find a local maximum to start a new band in the work queue + double localMaxElement = Double.NEGATIVE_INFINITY; + int localMaxElementIndex = 0; + for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) { + final double bestMetric = calculatedValues.get(kkk); + if( bestMetric > localMaxElement ) { + localMaxElement = bestMetric; + localMaxElementIndex = kkk; + } else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum + if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) { + workToBeAdded.add( work + localMaxElementIndex ); + } + break; + } + } + calculatedValues.clear(); + + // reset iii and jjj to the appropriate diagonal baseline location + iii = 0; + jjj = diag; + if( diag > Y_METRIC_LENGTH ) { + iii = diag - Y_METRIC_LENGTH; + jjj = Y_METRIC_LENGTH; + } + // move to the starting work location along the diagonal + iii += work-1; + jjj -= work-1; + + // step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value + for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) { + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, + insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); + final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); + if( bestMetric > maxElement ) { + maxElement = bestMetric; + } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { + break; + } + } + } + } + workQueue.clear(); + workQueue.addAll(workToBeAdded); 
+ workToBeAdded.clear(); + } + } else { + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); + } + + // private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other + private boolean detectClusteredStartLocations( final ArrayList<Integer> list, int loc ) { + for(int x : list) { + if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) { + return true; + } + } + return false; + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java new file mode 100644 index 000000000..22bcb1bbf --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.utils; + + +// the imports for unit testing. + + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class PairHMMUnitTest extends BaseTest { + final static boolean EXTENSIVE_TESTING = true; + PairHMM hmm = new PairHMM( false ); // reference implementation + PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding + + // -------------------------------------------------------------------------------- + // + // Provider + // + // -------------------------------------------------------------------------------- + + private class BasicLikelihoodTestProvider extends TestDataProvider { + final String ref, read; + final byte[] refBasesWithContext, readBasesWithContext; + final int baseQual, insQual, delQual, gcp; + final int expectedQual; + final static String CONTEXT = "ACGTAATGACGATTGCA"; + final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; + final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; + + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); + } + + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { + super(BasicLikelihoodTestProvider.class, String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r 
flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); + this.baseQual = baseQual; + this.delQual = delQual; + this.insQual = insQual; + this.gcp = gcp; + this.read = read; + this.ref = ref; + this.expectedQual = expectedQual; + + refBasesWithContext = asBytes(ref, left, right); + readBasesWithContext = asBytes(read, false, false); + } + + public double expectedLogL() { + return expectedQual / -10.0; + } + + public double tolerance() { + return 0.1; // TODO FIXME arbitrary + } + + public double calcLogL() { + + double logL = hmm.computeReadLikelihoodGivenHaplotype( + refBasesWithContext, readBasesWithContext, + qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true), + qualAsBytes(gcp, false)); + + return logL; + } + + private final byte[] asBytes(final String bases, final boolean left, final boolean right) { + return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); + } + + private byte[] qualAsBytes(final int phredQual, final boolean doGOP) { + final byte phredQuals[] = new byte[readBasesWithContext.length]; + // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM + Arrays.fill(phredQuals, (byte)100); + + // update just the bases corresponding to the provided micro read with the quality scores + if( doGOP ) { + phredQuals[0 + CONTEXT.length()] = (byte)phredQual; + } else { + for ( int i = 0; i < read.length(); i++) + phredQuals[i + CONTEXT.length()] = (byte)phredQual; + } + + return phredQuals; + } + } + + final Random random = new Random(87865573); + private class BandedLikelihoodTestProvider extends TestDataProvider { + final String ref, read; + final byte[] refBasesWithContext, readBasesWithContext; + final int baseQual, insQual, delQual, gcp; + final int expectedQual; + final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC"; + final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA"; + final 
static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT"; + final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA"; + final byte[] baseQuals, insQuals, delQuals, gcps; + + public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); + } + + public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { + super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); + this.baseQual = baseQual; + this.delQual = delQual; + this.insQual = insQual; + this.gcp = gcp; + this.read = read; + this.ref = ref; + this.expectedQual = expectedQual; + + refBasesWithContext = asBytes(ref, left, right); + readBasesWithContext = asBytes(read, false, false); + baseQuals = qualAsBytes(baseQual); + insQuals = qualAsBytes(insQual); + delQuals = qualAsBytes(delQual); + gcps = qualAsBytes(gcp, false); + } + + public double expectedLogL() { + double logL = hmm.computeReadLikelihoodGivenHaplotype( + refBasesWithContext, readBasesWithContext, + baseQuals, insQuals, delQuals, gcps); + + return logL; + } + + public double tolerance() { + return 0.2; // TODO FIXME arbitrary + } + + public double calcLogL() { + + double logL = bandedHMM.computeReadLikelihoodGivenHaplotype( + refBasesWithContext, readBasesWithContext, + baseQuals, insQuals, delQuals, gcps); + + return logL; + } + + private final byte[] asBytes(final String bases, final boolean left, final boolean right) { + return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? 
RIGHT_FLANK : "")).getBytes(); + } + + private byte[] qualAsBytes(final int phredQual) { + return qualAsBytes(phredQual, true); + } + + private byte[] qualAsBytes(final int phredQual, final boolean addRandom) { + final byte phredQuals[] = new byte[readBasesWithContext.length]; + Arrays.fill(phredQuals, (byte)phredQual); + if(addRandom) { + for( int iii = 0; iii < phredQuals.length; iii++) { + phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3)); + } + } + return phredQuals; + } + } + + @DataProvider(name = "BasicLikelihoodTestProvider") + public Object[][] makeBasicLikelihoodTests() { + // context on either side is ACGTTGCA REF ACGTTGCA + // test all combinations + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + + for ( final int baseQual : baseQuals ) { + for ( final int indelQual : indelQuals ) { + for ( final int gcp : gcps ) { + + // test substitutions + for ( final byte refBase : BaseUtils.BASES ) { + for ( final byte readBase : BaseUtils.BASES ) { + final String ref = new String(new byte[]{refBase}); + final String read = new String(new byte[]{readBase}); + final int expected = refBase == readBase ? 0 : baseQual; + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + } + } + + // test insertions and deletions + for ( final int size : sizes ) { + for ( final byte base : BaseUtils.BASES ) { + final int expected = indelQual + (size - 2) * gcp; + + for ( boolean insertionP : Arrays.asList(true, false)) { + final String small = Utils.dupString((char)base, 1); + final String big = Utils.dupString((char)base, size); + + final String ref = insertionP ? 
small : big; + final String read = insertionP ? big : small; + + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + } + } + } + } + } + } + + return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); + } + + @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) + public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { + double calculatedLogL = cfg.calcLogL(); + double expectedLogL = cfg.expectedLogL(); + logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + } + + @DataProvider(name = "BandedLikelihoodTestProvider") + public Object[][] makeBandedLikelihoodTests() { + // context on either side is ACGTTGCA REF ACGTTGCA + // test all combinations + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + + for ( final int baseQual : baseQuals ) { + for ( final int indelQual : indelQuals ) { + for ( final int gcp : gcps ) { + + // test substitutions + for ( final byte refBase : BaseUtils.BASES ) { + for ( final byte readBase : BaseUtils.BASES ) { + final String ref = new String(new byte[]{refBase}); + final String read = new String(new byte[]{readBase}); + final int expected = refBase == readBase ? 
0 : baseQual; + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + } + } + + // test insertions and deletions + for ( final int size : sizes ) { + for ( final byte base : BaseUtils.BASES ) { + final int expected = indelQual + (size - 2) * gcp; + + for ( boolean insertionP : Arrays.asList(true, false)) { + final String small = Utils.dupString((char)base, 1); + final String big = Utils.dupString((char)base, size); + + final String ref = insertionP ? small : big; + final String read = insertionP ? big : small; + + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + } + } + } + } + } + } + + return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class); + } + + @Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true) + public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) { + double calculatedLogL = cfg.calcLogL(); + double expectedLogL = cfg.expectedLogL(); + logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + } + + @Test + public void testMismatchInEveryPositionInTheReadWithCenteredHaplotype() { + byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); + + final int offset = 2; + byte[] gop = new byte[haplotype1.length - 2 * offset]; + Arrays.fill(gop, (byte) 80); + byte[] gcp = new byte[haplotype1.length - 2 * offset]; + Arrays.fill(gcp, (byte) 80); + + for( int k = 0; k < haplotype1.length - 2 * offset; k++ ) { + byte[] quals = new byte[haplotype1.length - 2 * offset]; + 
Arrays.fill(quals, (byte) 90); + // one read mismatches the haplotype + quals[k] = 20; + + byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); + // change single base at position k to C. If it's a C, change to T + mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); + double res1 = hmm.computeReadLikelihoodGivenHaplotype( + haplotype1, mread, + quals, gop, gop, + gcp); + + + System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); + + Assert.assertEquals(res1, -2.0, 1e-2); + } + } + + @Test + public void testMismatchInEveryPositionInTheRead() { + byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); + + final int offset = 2; + byte[] gop = new byte[haplotype1.length - offset]; + Arrays.fill(gop, (byte) 80); + byte[] gcp = new byte[haplotype1.length - offset]; + Arrays.fill(gcp, (byte) 80); + + for( int k = 0; k < haplotype1.length - offset; k++ ) { + byte[] quals = new byte[haplotype1.length - offset]; + Arrays.fill(quals, (byte) 90); + // one read mismatches the haplotype + quals[k] = 20; + + byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); + // change single base at position k to C. If it's a C, change to T + mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); + double res1 = hmm.computeReadLikelihoodGivenHaplotype( + haplotype1, mread, + quals, gop, gop, + gcp); + + + System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); + + Assert.assertEquals(res1, -2.0, 1e-2); + } + } +} \ No newline at end of file From 76a6e37f4f374e20a40e72199dd38a26348ff5b9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 19 Apr 2012 11:45:56 -0400 Subject: [PATCH 07/17] Don't output callability metrics by default anymore; one can still have them output to the 'metrics' file (which is now @Hidden because they are really for GSA use). 
Added a TODO to move UG from @By reference to reads and rods once LIBS is cleaned up. --- .../gatk/walkers/genotyper/UnifiedGenotyper.java | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 9036e3a62..3cec931d0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -116,6 +116,8 @@ import java.util.*; @ReadFilters( {BadMateFilter.class, MappingQualityUnavailableFilter.class} ) @Reference(window=@Window(start=-200,stop=200)) @By(DataSource.REFERENCE) +// TODO -- When LocusIteratorByState gets cleaned up, we should enable multiple @By sources: +// TODO -- @By( {DataSource.READS, DataSource.REFERENCE_ORDERED_DATA} ) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) public class UnifiedGenotyper extends LocusWalker, UnifiedGenotyper.UGStatistics> implements TreeReducible, AnnotatorCompatibleWalker { @@ -155,6 +157,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif @Argument(fullName = "debug_file", shortName = "debug_file", doc = "File to print all of the annotated and detailed debugging output", required = false) protected PrintStream verboseWriter = null; + @Hidden @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print any relevant callability metrics output", required = false) protected PrintStream metricsWriter = null; @@ -347,14 +350,6 @@ public class UnifiedGenotyper extends LocusWalker, Unif } public void onTraversalDone(UGStatistics sum) { - logger.info(String.format("Visited bases %d", sum.nBasesVisited)); - logger.info(String.format("Callable bases %d", sum.nBasesCallable)); - logger.info(String.format("Confidently called bases %d", sum.nBasesCalledConfidently)); - 
logger.info(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll())); - logger.info(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll())); - logger.info(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable())); - logger.info(String.format("Actual calls made %d", sum.nCallsMade)); - if ( metricsWriter != null ) { metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited)); metricsWriter.println(String.format("Callable bases %d", sum.nBasesCallable)); From 02ff930f6a854b0704a882860b243216e7d7cf92 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 19 Apr 2012 12:45:18 -0400 Subject: [PATCH 08/17] My changes --- .../indels/PairHMMIndelErrorModel.java | 49 +++++++++++++------ .../broadinstitute/sting/utils/PairHMM.java | 12 ++--- .../UnifiedGenotyperIntegrationTest.java | 36 +++++++------- 3 files changed, 57 insertions(+), 40 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 171c42040..5ac8b981e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -43,14 +43,13 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; -import java.util.Map; public class PairHMMIndelErrorModel { public static final int BASE_QUAL_THRESHOLD = 20; private boolean DEBUG = false; - private boolean bandedLikelihoods = true; + private boolean bandedLikelihoods = false; private static final int MAX_CACHED_QUAL = 127; @@ -157,7 +156,7 @@ public class PairHMMIndelErrorModel { } - private void updateCell(final int indI, final int indJ, final int X_METRIC_LENGTH, final int 
Y_METRIC_LENGTH, byte[] readBases, byte[] readQuals, byte[] haplotypeBases, + private static void updateCell(final int indI, final int indJ, final int X_METRIC_LENGTH, final int Y_METRIC_LENGTH, byte[] readBases, byte[] readQuals, byte[] haplotypeBases, byte[] currentGOP, byte[] currentGCP, double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { if (indI > 0 && indJ > 0) { final int im1 = indI -1; @@ -183,9 +182,27 @@ public class PairHMMIndelErrorModel { } } - private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, + public static double computeReadLikehoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, + byte[] currentGOP, byte[] currentGCP, boolean bandedLikelihoods) { + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions + final int X_METRIC_LENGTH = readBases.length + 1; + final int Y_METRIC_LENGTH = haplotypeBases.length + 1; + + // initial arrays to hold the probabilities of being in the match, insertion and deletion cases + final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + + return computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentGOP, + currentGCP, 0, matchMetricArray, XMetricArray, YMetricArray, bandedLikelihoods); + + } + private static double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, byte[] currentGOP, byte[] currentGCP, int indToStart, - double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { + double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray, + 
boolean bandedLikelihoods) { final int X_METRIC_LENGTH = readBases.length+1; final int Y_METRIC_LENGTH = haplotypeBases.length+1; @@ -391,6 +408,9 @@ public class PairHMMIndelErrorModel { } } else { + if (DEBUG) { + System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); + } // System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName()); GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); @@ -577,8 +597,8 @@ public class PairHMMIndelErrorModel { final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - final int X_METRIC_LENGTH = readBases.length+1; - final int Y_METRIC_LENGTH = haplotypeBases.length+1; + final int X_METRIC_LENGTH = readBases.length+2; + final int Y_METRIC_LENGTH = haplotypeBases.length+2; if (matchMetricArray == null) { //no need to reallocate arrays for each new haplotype, as length won't change @@ -588,7 +608,7 @@ public class PairHMMIndelErrorModel { } - pairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); /* if (previousHaplotypeSeen == null) @@ -602,17 +622,14 @@ public class PairHMMIndelErrorModel { contextLogGapOpenProbabilities, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities, startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); - /* double r2 = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, contextLogGapOpenProbabilities, - contextLogGapContinuationProbabilities, 0, matchMetricArray, XMetricArray, YMetricArray); - - if (readLikelihood > 0) { - int k=0; - } - */ if (DEBUG) { +/* double l2 = computeReadLikehoodGivenHaplotype(haplotypeBases, readBases, readQuals, contextLogGapOpenProbabilities, + 
contextLogGapContinuationProbabilities, bandedLikelihoods); + */ + if (DEBUG) { System.out.println("H:"+new String(haplotypeBases)); System.out.println("R:"+new String(readBases)); System.out.format("L:%4.2f\n",readLikelihood); - // System.out.format("Lorig:%4.2f\n",r2); + // System.out.format("Lorig:%4.2f\n",r2); System.out.format("StPos:%d\n", startIndexInHaplotype); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java index 58bed2795..9fcb97a4d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java @@ -41,18 +41,18 @@ public class PairHMM { private static final byte DEFAULT_GCP = (byte) 10; private static final double BANDING_TOLERANCE = 22.0; private static final int BANDING_CLUSTER_WINDOW = 12; - private final boolean doBanded; + private final boolean noBanded; public PairHMM() { - doBanded = false; + noBanded = false; } - public PairHMM( final boolean doBanded ) { - this.doBanded = doBanded; + public PairHMM( final boolean noBanded ) { + this.noBanded = noBanded; } - public void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, + public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, final int X_METRIC_LENGTH) { for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { @@ -100,7 +100,7 @@ public class PairHMM { readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); } - if( doBanded ) { + if( false ) { final ArrayList workQueue = new ArrayList(); // holds a queue of starting work location (indices along the diagonal). 
Will be sorted each step final ArrayList workToBeAdded = new ArrayList(); final ArrayList calculatedValues = new ArrayList(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 4d00f6113..067e9088c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("d3191b2f10139c969501990ffdf29082")); + Arrays.asList("9b08dc6800ba11bc6d9f6ccf392a60fe")); executeTest("test MultiSample Pilot1", spec); } @@ -54,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("7c7288170c6aadae555a44e79ca5bf19")); + Arrays.asList("d275e0f75368dbff012ea8655dce3444")); executeTest("test SingleSample Pilot2", spec); } @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("c956f0ea0e5f002288a09f4bc4af1319")); + 
Arrays.asList("e948543b83bfd0640fcb994d72f8e234")); executeTest("test Multiple SNP alleles", spec); } @@ -80,7 +80,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "2158eb918abb95225ea5372fcd9c9236"; + private final static String COMPRESSED_OUTPUT_MD5 = "1e3c897794e5763a8720807686707b18"; @Test public void testCompressedOutput() { @@ -101,7 +101,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "834e85f6af4ad4a143b913dfc7defb08"; + String md5 = "06d11ed89f02f08911e100df0f7db7a4"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -200,8 +200,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "d5879f1c277035060434d79a441b31ca" ); - e.put( 1.0 / 1850, "13f80245bab2321b92d27eebd5c2fc33" ); + e.put( 0.01, "d07e5ca757fbcb1c03f652f82265c2f8" ); + e.put( 1.0 / 1850, "d1fb9186e6f39f2bcf5d0edacd8f7fe2" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -225,7 +225,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("8c134a6e0abcc70d2ed3216d5f8e0100")); + Arrays.asList("623be1fd8b63a01bfe35ac864d5199fe")); executeTest(String.format("test multiple technologies"), spec); } @@ -244,7 +244,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - 
Arrays.asList("34baad3177712f6cd0b476f4c578e08f")); + Arrays.asList("40ea10c0238c3be2991d31ae72476884")); executeTest(String.format("test calling with BAQ"), spec); } @@ -263,7 +263,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("4bf4f819a39a73707cae60fe30478742")); + Arrays.asList("c9b0bd900a4ec949adfbd28909581eeb")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -278,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("ae08fbd6b0618cf3ac1be763ed7b41ca")); + Arrays.asList("6b7c8691c527facf9884c2517d943f2f")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -291,7 +291,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("120600f2bfa3a47bd93b50f768f98d5b")); + Arrays.asList("d72603aa33a086d64d4dddfd2995552f")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -301,7 +301,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("2e75d2766235eab23091a67ea2947d13")); + Arrays.asList("4a59fe207949b7d043481d7c1b786573")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -311,7 +311,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + 
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("5057bd7d07111e8b1085064782eb6c80")); + Arrays.asList("a8a9ccf30bddee94bb1d300600794ee7")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -319,13 +319,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("c0f9ca3ceab90ebd38cc0eec9441d71f")); + Arrays.asList("0b388936022539530f565da14d5496d3")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("0240f34e71f137518be233c9890a5349")); + Arrays.asList("537dd9b4174dc356fb13d8d3098ad602")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -368,7 +368,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("53758e66e3a3188bd9c78d2329d41962")); + Arrays.asList("973178b97efd2daacc9e45c414275d59")); executeTest("test minIndelFraction 0.0", spec); } @@ -376,7 +376,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("3aa39b1f6f3b1eb051765f9c21f6f461")); + 
Arrays.asList("220facd2eb0923515d1d8ab874055564")); executeTest("test minIndelFraction 0.25", spec); } From 79272c5e1523f1fd11fca1cce7f8617890cde3d3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 19 Apr 2012 12:48:09 -0400 Subject: [PATCH 09/17] Thanks to Menachem for pointing out that the docs for genotyping_mode and output_mode were the same (and unclear). Fixed. --- .../gatk/walkers/genotyper/UnifiedArgumentCollection.java | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index d7174536e..f4ffbad91 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -55,13 +55,10 @@ public class UnifiedArgumentCollection { @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) public Double PCR_error = DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.DEFAULT_PCR_ERROR_RATE; - /** - * Specifies how to determine the alternate allele to use for genotyping - */ - @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) + @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; - @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Should we output confident genotypes (i.e. 
including ref calls) or just the variants?", required = false) + @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; /** From df5dd841af8e1912824d235379907fc349f47b93 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Thu, 19 Apr 2012 16:08:55 -0400 Subject: [PATCH 11/17] AC strat now checks if evals will be merged before throwing an error on multiple eval files. Minor tweaks to WGP script based on new recal VCF format. --- .../stratifications/AlleleCount.java | 4 +- .../VariantEvalIntegrationTest.java | 77 +++++++++++++++---- 2 files changed, 63 insertions(+), 18 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index 072962436..7a3b85567 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -20,10 +20,8 @@ import java.util.*; public class AlleleCount extends VariantStratifier { @Override public void initialize() { - List> evals = getVariantEvalWalker().getEvals(); - // we can only work with a single eval VCF, and it must have genotypes - if ( evals.size() != 1 ) + if ( getVariantEvalWalker().getEvals().size() != 1 && !getVariantEvalWalker().mergeEvals ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification only works with a single eval vcf"); // There are 2 x n sample chromosomes for diploids diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 
1ab7b679e..71c014f2c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -34,6 +34,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval"; private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf"; private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf"; + private static String fundamentalTestSNPsSplit1of2VCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.split_1_of_2.vcf"; + private static String fundamentalTestSNPsSplit2of2VCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.split_2_of_2.vcf"; private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.NA12045.vcf"; private static String cmdRoot = "-T VariantEval" + @@ -437,24 +439,69 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testAlleleCountStrat() { WalkerTestSpec spec = new WalkerTestSpec( - buildCommandLine( - "-T VariantEval", - "-R " + b37KGReference, - "--dbsnp " + b37dbSNP132, - "--eval " + fundamentalTestSNPsVCF, - "-noEV", - "-EV CountVariants", - "-noST", - "-ST AlleleCount", - "-L " + fundamentalTestSNPsVCF, - "-o %s" - ), - 1, - Arrays.asList("1198bfea6183bd43219071a84c79a386") - ); + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestSNPsVCF, + "-noEV", + "-EV CountVariants", + "-noST", + "-ST AlleleCount", + "-L " + fundamentalTestSNPsVCF, + "-o %s" + ), + 1, + Arrays.asList("1198bfea6183bd43219071a84c79a386") + ); executeTest("testAlleleCountStrat", 
spec); } + @Test + public void testMultipleEvalTracksAlleleCountWithMerge() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestSNPsSplit1of2VCF, + "--eval " + fundamentalTestSNPsSplit2of2VCF, + "--mergeEvals", + "-noEV", + "-EV CountVariants", + "-noST", + "-ST AlleleCount", + "-L " + fundamentalTestSNPsVCF, + "-o %s" + ), + 1, + Arrays.asList("1198bfea6183bd43219071a84c79a386") + ); + executeTest("testMultipleEvalTracksAlleleCountWithMerge", spec); + } + + @Test + public void testMultipleEvalTracksAlleleCountWithoutMerge() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestSNPsSplit1of2VCF, + "--eval " + fundamentalTestSNPsSplit2of2VCF, + //"--mergeEvals", No merge with AC strat ==> error + "-noEV", + "-EV CountVariants", + "-noST", + "-ST AlleleCount", + "-L " + fundamentalTestSNPsVCF + ), + 0, + UserException.class + ); + executeTest("testMultipleEvalTracksAlleleCountWithoutMerge", spec); + } + @Test public void testIntervalStrat() { WalkerTestSpec spec = new WalkerTestSpec( From 0f8c77391d67407844f28e82f63b5556bdd81f80 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 19 Apr 2012 12:21:12 -0400 Subject: [PATCH 12/17] BQSR bug triage #3 * fixed context covariate famous "off by one" error * reduced maximum quality score to Q50 (following Eric/Ryan's suggestion) * remove context downsampling in BQSR R script --- .../gatk/walkers/bqsr/ContextCovariate.java | 4 ++-- .../sting/gatk/walkers/bqsr/Datum.java | 2 +- .../gatk/walkers/bqsr/RecalDataManager.java | 3 +-- .../sting/gatk/walkers/bqsr/RecalDatum.java | 4 ++-- .../sting/utils/QualityUtils.java | 1 + .../recalibration/BaseRecalibration.java | 9 ++++--- .../bqsr/ContextCovariateUnitTest.java | 4 ++-- .../BaseRecalibrationUnitTest.java | 24 ++++++++----------- 8 files 
changed, 23 insertions(+), 28 deletions(-) mode change 100755 => 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index c7c281943..c5aabc64d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -125,8 +125,8 @@ public class ContextCovariate implements StandardCovariate { */ private BitSet contextWith(byte[] bases, int offset, int contextSize) { BitSet result = null; - if (offset >= contextSize) { - String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset)); + if (offset - contextSize + 1 >= 0) { + String context = new String(Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1)); if (!context.contains("N")) result = BitSetUtils.bitSetFrom(context); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java old mode 100755 new mode 100644 index b3ea88d58..77e4cc8c7 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java @@ -76,7 +76,7 @@ public class Datum { final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT); double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - return Math.min(empiricalQual, (double) QualityUtils.MAX_QUAL_SCORE); + return Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); } byte empiricalQualByte() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java index cedff0a80..64dba0551 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java @@ -31,7 +31,6 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.collections.Pair; @@ -152,7 +151,7 @@ public class RecalDataManager { ArrayList optionalCovariatesToAdd = new ArrayList(); // initialize an empty array of optional covariates to create the first few tables for (Covariate covariate : requiredCovariates) { requiredCovariatesToAdd.add(covariate); - final Map recalTable = new HashMap(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively) + final Map recalTable = new HashMap(); // initializing a new recal table for each required covariate (cumulatively) final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java index c71a00a3a..2dac90252 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java @@ -74,7 +74,7 @@ public class RecalDatum extends Datum { } public final void calcCombinedEmpiricalQuality() 
{ - this.empiricalQuality = empiricalQualDouble(); // cache the value so we don't call log over and over again + this.empiricalQuality = empiricalQualDouble(); // cache the value so we don't call log over and over again } public final void calcEstimatedReportedQuality() { @@ -102,7 +102,7 @@ public class RecalDatum extends Datum { @Override public String toString() { - return String.format("%d,%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality()), (byte) Math.floor(getEstimatedQReported())); + return String.format("%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality())); } diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index b5aa2598e..f53b439da 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -9,6 +9,7 @@ import net.sf.samtools.SAMUtils; * @author Kiran Garimella */ public class QualityUtils { + public final static byte MAX_RECALIBRATED_Q_SCORE = 50; public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 70eb9426b..d85fb03cd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -68,9 +68,9 @@ public class BaseRecalibration { /** * This constructor only exists for testing purposes. 
* - * @param quantizationInfo - * @param keysAndTablesMap - * @param requestedCovariates + * @param quantizationInfo the quantization info object + * @param keysAndTablesMap the map of key managers and recalibration tables + * @param requestedCovariates the list of requested covariates */ protected BaseRecalibration(QuantizationInfo quantizationInfo, LinkedHashMap> keysAndTablesMap, ArrayList requestedCovariates) { this.quantizationInfo = quantizationInfo; @@ -179,9 +179,8 @@ public class BaseRecalibration { } double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula - recalibratedQual = QualityUtils.boundQual((int) Math.round(recalibratedQual), QualityUtils.MAX_QUAL_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL + recalibratedQual = QualityUtils.boundQual((int) Math.round(recalibratedQual), QualityUtils.MAX_RECALIBRATED_Q_SCORE); // recalibrated quality is bound between 1 and MAX_RECALIBRATED_Q_SCORE - return quantizationInfo.getQuantizedQuals().get((int) recalibratedQual); // return the quantized version of the recalibrated quality } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java index 2b4cb2ae3..4b384aac0 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java @@ -39,8 +39,8 @@ public class ContextCovariateUnitTest { private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) { for (int i = 0; i < values.length; i++) { String expectedContext = null; - if (i >= contextSize) { - String context = bases.substring(i - contextSize, i); + if (i - contextSize + 1 >= 0) { + String context = bases.substring(i - contextSize + 1, i + 1); if (!context.contains("N"))
expectedContext = context; } diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java index 4f0d39991..1193b0aea 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java @@ -24,10 +24,6 @@ public class BaseRecalibrationUnitTest { private org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager dataManager; private LinkedHashMap> keysAndTablesMap; - private BQSRKeyManager rgKeyManager; - private BQSRKeyManager qsKeyManager; - private BQSRKeyManager cvKeyManager; - private ReadGroupCovariate rgCovariate; private QualityScoreCovariate qsCovariate; private ContextCovariate cxCovariate; @@ -60,13 +56,13 @@ public class BaseRecalibrationUnitTest { rgCovariate = new ReadGroupCovariate(); rgCovariate.initialize(RAC); requiredCovariates.add(rgCovariate); - rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + BQSRKeyManager rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); keysAndTablesMap.put(rgKeyManager, new HashMap()); qsCovariate = new QualityScoreCovariate(); qsCovariate.initialize(RAC); requiredCovariates.add(qsCovariate); - qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + BQSRKeyManager qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); keysAndTablesMap.put(qsKeyManager, new HashMap()); cxCovariate = new ContextCovariate(); @@ -75,7 +71,7 @@ public class BaseRecalibrationUnitTest { cyCovariate = new CycleCovariate(); cyCovariate.initialize(RAC); optionalCovariates.add(cyCovariate); - cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + BQSRKeyManager cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); 
keysAndTablesMap.put(cvKeyManager, new HashMap()); @@ -108,7 +104,7 @@ public class BaseRecalibrationUnitTest { updateCovariateWithKeySet(mapEntry.getValue(), key, newDatum); } } - dataManager.generateEmpiricalQualities(1, QualityUtils.MAX_QUAL_SCORE); + dataManager.generateEmpiricalQualities(1, QualityUtils.MAX_RECALIBRATED_Q_SCORE); List quantizedQuals = new ArrayList(); List qualCounts = new ArrayList(); @@ -179,7 +175,7 @@ public class BaseRecalibrationUnitTest { BitSet key = entry.getKey(); RecalDatum datum = entry.getValue(); List keySet = keyManager.keySetFrom(key); - System.out.println(String.format("%s => %s", Utils.join(",", keySet), datum)); + System.out.println(String.format("%s => %s", Utils.join(",", keySet), datum) + "," + datum.getEstimatedQReported()); } System.out.println(); } @@ -187,9 +183,9 @@ public class BaseRecalibrationUnitTest { } - private static void printNestedHashMap(Map table, String output) { + private static void printNestedHashMap(Map table, String output) { for (Object key : table.keySet()) { - String ret = ""; + String ret; if (output.isEmpty()) ret = "" + key; else @@ -199,7 +195,7 @@ public class BaseRecalibrationUnitTest { if (next instanceof org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) System.out.println(ret + " => " + next); else - printNestedHashMap((Map) next, "" + ret); + printNestedHashMap((Map) next, "" + ret); } } @@ -269,7 +265,7 @@ public class BaseRecalibrationUnitTest { } final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; - return QualityUtils.boundQual((int) Math.round(newQuality), QualityUtils.MAX_QUAL_SCORE); + return QualityUtils.boundQual((int) Math.round(newQuality), QualityUtils.MAX_RECALIBRATED_Q_SCORE); // Verbose printouts used to validate with old recalibrator //if(key.contains(null)) { @@ -289,6 +285,6 @@ public class BaseRecalibrationUnitTest { final double doubleMismatches = (double) (errors + smoothing); final double doubleObservations = 
(double) ( observations + smoothing ); double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - return Math.min(QualityUtils.MAX_QUAL_SCORE, empiricalQual); + return Math.min(QualityUtils.MAX_RECALIBRATED_Q_SCORE, empiricalQual); } } From c44c7b9a97858f9d077b8f3295f71256c0688f00 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 19 Apr 2012 19:39:43 -0400 Subject: [PATCH 13/17] Restored optimization in Pair HMM only to compute HMM matrices starting in index where haplotypes start to diverge - saves about 15-20% of runtime which is what we lost by disabling banding in latest version, so runtime should be now about the same as what it was before refactoring. Output is bit-true to previous commit --- .../indels/PairHMMIndelErrorModel.java | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 5f7730011..bcb9ea591 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -347,7 +347,6 @@ public class PairHMMIndelErrorModel { // initialize path metric and traceback memories for likelihood computation double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null; byte[] previousHaplotypeSeen = null; - int startIndexInHaplotype = 0; final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; @@ -376,12 +375,7 @@ public class PairHMMIndelErrorModel { indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); - if (indStart < 0 || indStop >= 
haplotype.getBases().length || indStart > indStop) { - // read spanned more than allowed reference context: we currently can't deal with this - throw new ReviewedStingException("BUG! bad read clipping"); -// readLikelihood =0; - } else - { + final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); @@ -394,28 +388,26 @@ public class PairHMMIndelErrorModel { XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); } - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + int startIndexInHaplotype = 0; + if (previousHaplotypeSeen != null) + startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); + previousHaplotypeSeen = haplotypeBases.clone(); readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, contextLogGapOpenProbabilities, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities, startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); - previousHaplotypeSeen = haplotypeBases.clone(); - - -/* double l2 = computeReadLikehoodGivenHaplotype(haplotypeBases, readBases, readQuals, contextLogGapOpenProbabilities, - contextLogGapContinuationProbabilities, bandedLikelihoods); - */ + if (DEBUG) { System.out.println("H:"+new String(haplotypeBases)); System.out.println("R:"+new String(readBases)); System.out.format("L:%4.2f\n",readLikelihood); - // System.out.format("Lorig:%4.2f\n",r2); System.out.format("StPos:%d\n", startIndexInHaplotype); } - } readEl.put(a,readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; } From de68363c23d0a58a2ce8babf9fa5ef3010239bbe Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 20 Apr 2012 10:58:34 -0400 Subject: [PATCH 15/17] Removed experimental feature (aka hack) that was 
meant for 1000G consensus but remained in VQSR data manager - QD was being scaled by indel length. There's no evidence any more that QD is length-dependent, neither in CEU trio data nor in latest 1000G P2 calls --- .../walkers/variantrecalibration/VariantDataManager.java | 8 -------- .../VariantRecalibrationWalkersIntegrationTest.java | 6 +++--- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index e2d1692d0..3778cffb8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -241,14 +241,6 @@ public class VariantDataManager { value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } - if (vc.isIndel() && annotationKey.equalsIgnoreCase("QD")) { - // normalize QD by event length for indel case - int eventLength = Math.abs(vc.getAlternateAllele(0).getBaseString().length() - vc.getReference().getBaseString().length()); // ignore multi-allelic complication here for now - if (eventLength > 0) { // sanity check - value /= (double)eventLength; - } - } - if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } } catch( Exception e ) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 11e093a6c..879a5bfa3 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -73,9 +73,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { } VRTest indel = new VRTest("combined.phase1.chr20.raw.indels.sites.vcf", - "6d7ee4cb651c8b666e4a4523363caaff", // tranches - "ee5b408c8434a594496118875690c438", // recal file - "5d7e07d8813db96ba3f3dfe4737f83d1"); // cut VCF + "da4458d05f6396f5c4ab96f274e5ccdc", // tranches + "cf380d9b0ae04c8918be8425f82035b4", // recal file + "b00e5e5a6807df8ed1682317948e8a6d"); // cut VCF @DataProvider(name = "VRIndelTest") public Object[][] createData2() {