From ed465cd2a506a623a0efbcc5d8477ebeb925726a Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Mon, 26 Aug 2013 17:33:17 -0400 Subject: [PATCH 01/77] Fixed a few typos and clarified some doc points. --- .../VariantDataManager.java | 2 +- .../VariantRecalibrator.java | 31 ++++++++++++------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 9752ab5f5..65b1c2322 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -264,7 +264,7 @@ public class VariantDataManager { Collections.sort( data, new VariantDatum.VariantDatumLODComparator() ); final int numToAdd = minimumNumber - trainingData.size(); if( numToAdd > data.size() ) { - throw new UserException.BadInput( "Error during negative model training. Minimum number of variants to use in training is larger than the whole call set. One can attempt to lower the --numBadVariants arugment but this is unsafe." ); + throw new UserException.BadInput( "Error during negative model training. Minimum number of variants to use in training is larger than the whole call set. You can try lowering the --numBadVariants argument but this is unsafe." 
); } int index = 0, numAdded = 0; while( numAdded < numToAdd && index < data.size() ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 1c56f7fff..5a8debc72 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -79,14 +79,14 @@ import java.util.*; * Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants. * *

- * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with ApplyRecalibration walker. + * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with the ApplyRecalibration walker. *

* *

* The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. - * One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. + * You can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship - * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the the probability that a SNP is a true genetic + * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the probability that a SNP is a true genetic * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the @@ -94,12 +94,7 @@ import java.util.*; * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. *

* - *

- * NOTE: In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). - * See http://www.r-project.org for more info on how to download and install R. - *

- * - *

Input

+ *

Inputs

*

* The input raw variants to be recalibrated. *

@@ -127,6 +122,17 @@ import java.util.*; * -rscriptFile path/to/output.plots.R * * + *

Caveat

+ * + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @@ -155,7 +161,7 @@ public class VariantRecalibrator extends RodWalker> resource = Collections.emptyList(); @@ -175,7 +181,8 @@ public class VariantRecalibrator extends RodWalker Date: Tue, 11 Jun 2013 17:04:19 -0400 Subject: [PATCH 02/77] Add Array Logless PairHMM A new PairHMM implementation for read/haplotype likelihood calculations. Output is the same as the LOGLESS_CACHING version. Instead of allocating an entire (read x haplotype) matrix for each HMM state, this version stores sub-computations in 1D arrays. It also accesses intersections of the (read x haplotype) alignment in a different order, proceeding over "diagonals" if we think of the alignment as a matrix. This implementation makes use of haplotype caching. Because arrays are overwritten, it has to explicitly store mid-process information. Knowing where to capture this info requires us to look ahead at the subsequent haplotype to be analyzed. This necessitated a signature change in the primary method for all pairHMM implementations. We also had to adjust the classes that employ the pairHMM: LikelihoodCalculationEngine (used by HaplotypeCaller) PairHMMIndelErrorModel (used by indel genotyping classes) Made the array version the default in the HaplotypeCaller and the UnifiedArgumentCollection. The latter affects classes: ErrorModel GeneralPloidyIndelGenotypeLikelihoodsCalculationModel IndelGenotypeLikelihoodsCalculationModel ... 
all of which use the pairHMM via PairHMMIndelErrorModel --- .../genotyper/UnifiedArgumentCollection.java | 2 +- .../haplotypecaller/HaplotypeCaller.java | 2 +- .../LikelihoodCalculationEngine.java | 11 +- .../indels/PairHMMIndelErrorModel.java | 58 ++- .../utils/pairhmm/ArrayLoglessPairHMM.java | 359 ++++++++++++++++++ .../sting/utils/pairhmm/CnyPairHMM.java | 3 +- .../sting/utils/pairhmm/LoglessPairHMM.java | 3 +- .../sting/utils/pairhmm/PairHMMTestData.java | 58 ++- .../sting/utils/pairhmm/PairHMMUnitTest.java | 113 ++++-- .../sting/utils/pairhmm/Log10PairHMM.java | 3 +- .../sting/utils/pairhmm/N2MemoryPairHMM.java | 4 - .../sting/utils/pairhmm/PairHMM.java | 25 +- 12 files changed, 560 insertions(+), 81 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 4fae3d6e3..ff6bc5407 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -85,7 +85,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection * The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. */ @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false) - public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ARRAY_LOGLESS; /** * The minimum confidence needed in a given base for it to be used in variant calling. 
Note that the base quality of a base diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 0b95ed07e..7edf55fed 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -387,7 +387,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @Hidden @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) - public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ARRAY_LOGLESS; @Hidden @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index e6e76ba90..0d55797bc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -58,6 +58,7 @@ import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator; +import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; 
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM; @@ -98,8 +99,13 @@ public class LikelihoodCalculationEngine { return new LoglessPairHMM(); else return new CnyPairHMM(); + case ARRAY_LOGLESS: + if (noFpga || !CnyPairHMM.isAvailable()) + return new ArrayLoglessPairHMM(); + else + return new CnyPairHMM(); default: - throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING."); + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, LOGLESS_CACHING, and ARRAY_LOGLESS."); } } }; @@ -258,9 +264,10 @@ public class LikelihoodCalculationEngine { // iterate over all haplotypes, calculating the likelihood of the read for each haplotype for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { final Haplotype haplotype = haplotypes.get(jjj); + final byte[] nextHaplotypeBases = (jjj == numHaplotypes - 1) ? 
null : haplotypes.get(jjj+1).getBases(); final boolean isFirstHaplotype = jjj == 0; final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype); + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases); if ( WRITE_LIKELIHOODS_TO_FILE ) { likelihoodsStream.printf("%s %s %s %s %s %s %f%n", diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index c77557da6..3c6e409b9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -53,6 +53,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.PairHMM; @@ -120,8 +121,11 @@ public class PairHMMIndelErrorModel { case LOGLESS_CACHING: pairHMM = new LoglessPairHMM(); break; + case ARRAY_LOGLESS: + pairHMM = new ArrayLoglessPairHMM(); + break; default: - throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL, EXACT or LOGLESS_CACHING."); + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. 
Acceptable options are ORIGINAL, EXACT, LOGLESS_CACHING, or ARRAY_LOGLESS."); } // fill gap penalty table, affine naive model: @@ -365,7 +369,10 @@ public class PairHMMIndelErrorModel { baseDeletionQualities = contextLogGapOpenProbabilities; } + byte[] currentHaplotypeBases = null; boolean firstHap = true; + double readLikelihood; + Allele currentAllele = null; for (Allele a: haplotypeMap.keySet()) { Haplotype haplotype = haplotypeMap.get(a); @@ -381,34 +388,65 @@ public class PairHMMIndelErrorModel { final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); - double readLikelihood; + if (DEBUG) System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); - final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); + // peek at the next haplotype in the list + final byte[] nextHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); + // process the current haplotype in the list + if (currentHaplotypeBases != null) { + // it's possible that the indel starts at the last base of the haplotypes + if ( currentHaplotypeBases.length == 0 ) { + readLikelihood = -Double.MAX_VALUE; + } else { + if (firstHap) { + //no need to reallocate arrays for each new haplotype, as length won't change + pairHMM.initialize(readBases.length, currentHaplotypeBases.length); + firstHap = false; + } + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals, + baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, nextHaplotypeBases); + } + + if (DEBUG) { + 
System.out.println("H:"+new String(currentHaplotypeBases)); + System.out.println("R:"+new String(readBases)); + System.out.format("L:%4.2f\n",readLikelihood); + } + + perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood); + readLikelihoods[readIdx][j++] = readLikelihood; + } + // update the current haplotype + currentHaplotypeBases = nextHaplotypeBases; + currentAllele = a; + } + // process the final haplotype + if (currentHaplotypeBases != null) { // it's possible that the indel starts at the last base of the haplotypes - if ( haplotypeBases.length == 0 ) { + if ( currentHaplotypeBases.length == 0 ) { readLikelihood = -Double.MAX_VALUE; } else { if (firstHap) { //no need to reallocate arrays for each new haplotype, as length won't change - pairHMM.initialize(readBases.length, haplotypeBases.length); + pairHMM.initialize(readBases.length, currentHaplotypeBases.length); firstHap = false; } - - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, - baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap); + // there is no next haplotype, so pass null for nextHaplotypeBases. 
+ readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals, + baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, null); } if (DEBUG) { - System.out.println("H:"+new String(haplotypeBases)); + System.out.println("H:"+new String(currentHaplotypeBases)); System.out.println("R:"+new String(readBases)); System.out.format("L:%4.2f\n",readLikelihood); } - perReadAlleleLikelihoodMap.add(p, a, readLikelihood); + perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; } } diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java new file mode 100644 index 000000000..26eb745bd --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java @@ -0,0 +1,359 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.QualityUtils; +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: bradt + * Date: 6/11/13 + */ +public class ArrayLoglessPairHMM extends PairHMM { + private static final double INITIAL_CONDITION = Math.pow(2, 1020); + private static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION); + + // we divide e by 3 because the observed base could have come from any of the non-observed alleles + protected static final double TRISTATE_CORRECTION = 3.0; + + private static final int matchToMatch = 0; + private static final int indelToMatch = 1; + private static final int matchToInsertion = 2; + private static final int insertionToInsertion = 3; + private static final int matchToDeletion = 4; + private static final int deletionToDeletion = 5; + + protected double[][] transition = null; // The transition probabilities cache + protected double[][] prior = null; // The prior probabilities cache + + // Array declarations for arrays implementation + private double[] currentMatchArray = null; + private double[] currentDeleteArray = null; + private double[] currentInsertArray = null; + private double[] parentMatchArray = null; + private double[] parentDeleteArray = null; + private double[] parentInsertArray = null; + private double[] grandparentMatchArray = null; + private double[] grandparentDeleteArray = null; + private double[] grandparentInsertArray = null; + + // When successive haplotypes have a common prefix, these arrays store cached info from the previous haplotype; for reading + private double[] matchCacheArray = null; + private double[] deleteCacheArray = null; + private double[] insertCacheArray = null; + + // These arrays store cache info for use with the next haplotype; for writing + private double[] nextMatchCacheArray = null; + private double[] 
nextDeleteCacheArray = null; + private double[] nextInsertCacheArray = null; + + // Used when caching to store our intermediate sum at point of first difference bw successive haplotypes + private double partialSum; + + + /** + * {@inheritDoc} + */ + @Override + public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { + super.initialize(readMaxLength, haplotypeMaxLength); + + transition = new double[paddedMaxReadLength][6]; + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + + // Initialize all arrays + // Final Cell of array is a padding cell, initialized to zero. + currentMatchArray = new double[paddedMaxReadLength]; + currentDeleteArray = new double[paddedMaxReadLength]; + currentInsertArray = new double[paddedMaxReadLength]; + + parentMatchArray = new double[paddedMaxReadLength]; + parentDeleteArray = new double[paddedMaxReadLength]; + parentInsertArray = new double[paddedMaxReadLength]; + + grandparentMatchArray = new double[paddedMaxReadLength]; + grandparentDeleteArray = new double[paddedMaxReadLength]; + grandparentInsertArray = new double[paddedMaxReadLength]; + + // Initialize the special arrays used for caching when successive haplotypes have a common prefix + matchCacheArray = new double[paddedMaxReadLength]; + deleteCacheArray = new double[paddedMaxReadLength]; + insertCacheArray = new double[paddedMaxReadLength]; + + nextMatchCacheArray = new double[paddedMaxReadLength]; + nextDeleteCacheArray = new double[paddedMaxReadLength]; + nextInsertCacheArray = new double [paddedMaxReadLength]; + } + + + /** + * {@inheritDoc} + */ + @Override + public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + int hapStartIndex, + final boolean recacheReadValues, + final int nextHapStartIndex) { + + if ( ! 
constantsAreInitialized) { + initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); + + // note that we initialized the constants + constantsAreInitialized = true; + } + initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); + + // Some housekeeping to be done if we are starting a new read + if (recacheReadValues) { + hapStartIndex = 0; + + initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); + // note that we initialized the constants + constantsAreInitialized = true; + + // Pad the ends of the Match and Insert arrays with 0. Analogous to the first row in the Match, Insert matrices of N2MemoryPairHMM + grandparentMatchArray[readBases.length] = 0; + grandparentInsertArray[readBases.length] = 0; + parentMatchArray[readBases.length] = 0; + parentInsertArray[readBases.length] = 0; + currentMatchArray[readBases.length] = 0; + currentInsertArray[readBases.length] = 0; + matchCacheArray[readBases.length] = 0; + insertCacheArray[readBases.length] = 0; + nextMatchCacheArray[readBases.length] = 0; + nextInsertCacheArray[readBases.length] = 0; + } + // if we have not cached from prev haplotype, clear any info we may have accumulated in a previous HMM iteration + if (hapStartIndex == 0) { + Arrays.fill(matchCacheArray, 0, readBases.length, 0); + Arrays.fill(deleteCacheArray, 0, readBases.length, 0); + Arrays.fill(insertCacheArray, 0, readBases.length, 0); + + partialSum = 0; + + // Padding value for the deletion arrays. Lets us have free deletions at the beginning + // Needs to be reset when starting a new read or when hap length changes (ie when hapStartIndex is 0) + final double initialValue = INITIAL_CONDITION / haplotypeBases.length; + // Pad the deletion arrays. 
Akin to padding the first row in the deletion matrix + parentDeleteArray[readBases.length] = initialValue; + grandparentDeleteArray[readBases.length] = initialValue; + currentDeleteArray[readBases.length] = initialValue; + deleteCacheArray[readBases.length] = initialValue; + nextDeleteCacheArray[readBases.length] = initialValue; + } + // We build up our solution by looking at position [0] in the match, insert arrays. Need to set to 0 before we start. + grandparentMatchArray[0] = 0; + grandparentInsertArray[0] = 0; + parentMatchArray[0] = 0; + parentInsertArray[0] = 0; + currentMatchArray[0] = 0; + currentInsertArray[0] = 0; + + // Array implementation. Start by initializing some array parameters + // Number of diagonals for a matrix = rows + cols - 1; + final int maxDiagonals = readBases.length + haplotypeBases.length - hapStartIndex - 1; + // The array indices we want to fill will be between these values + int startFill; + int endFill; + // The position of the arrays to be updated + int arrayIndex; + // The coordinate in our priors and transition matrices corresponding to a given position in the read/haplotype alignment + int matrixRow; + int matrixCol; + // The final answer prior to log10 correction + double finalArraySumProbabilities = partialSum; + // This index marks the diagonal at which to capture the partial sum to cache for the next haplotype + final int cacheSumIndex = nextHapStartIndex - hapStartIndex + readBases.length - 1; + + // Perform dynamic programming using arrays, as if over diagonals of a hypothetical read/haplotype alignment matrix + for (int i = 1; i <= maxDiagonals; i++) { + // set the bounds for cells we wish to fill in the arrays + startFill = Math.max(readBases.length - i, 0); + endFill = Math.min(maxDiagonals - i + 1, readBases.length); + + // apply any previously cached array information + if (i <= readBases.length) { + // apply caching info necessary for calculating current DELETE array values + parentMatchArray[startFill] = matchCacheArray[startFill]; + 
parentDeleteArray[startFill] = deleteCacheArray[startFill]; + // apply caching info necessary for calculating current MATCH array values + grandparentMatchArray[startFill + 1] = matchCacheArray[startFill + 1]; + grandparentDeleteArray[startFill + 1] = deleteCacheArray[startFill + 1]; + grandparentInsertArray[startFill + 1] = insertCacheArray[startFill + 1]; + } + + // fill in the cells for our arrays + for (arrayIndex = startFill; arrayIndex < endFill; arrayIndex++) { + + // translate the array position into a row, column in the priors and transition matrices + matrixRow = readBases.length - arrayIndex - 1; + matrixCol = i - matrixRow - 1 + hapStartIndex; + + // update cell for each of our new arrays. Prior, transition matrices are padded +1 row,col + updateArrayCell(arrayIndex, prior[matrixRow+1][matrixCol+1], transition[matrixRow+1]); + + // Set up caching for the next haplotype + // At the position of the final similar base between this haplotype and the next one, remember the mid-array values + if (matrixCol == nextHapStartIndex - 1) { + nextMatchCacheArray[arrayIndex] = currentMatchArray[arrayIndex]; + nextDeleteCacheArray[arrayIndex] = currentDeleteArray[arrayIndex]; + nextInsertCacheArray[arrayIndex] = currentInsertArray[arrayIndex]; + } + } + + // final probability is the log10 sum of the last element in the Match and Insertion state arrays + // this way we ignore all paths that ended in deletions! (huge) + // but we have to sum all the paths ending in the M and I arrays, because they're no longer extended. + // Where i > readBases.length, array[0] corresponds to bottom row of a [read] x [haplotype] matrix. Before this, they carries the 0's we set above. + finalArraySumProbabilities += currentInsertArray[0] + currentMatchArray[0]; + + // Partial sum for caching the next haplotype: + // At the position of the last similar base between this haplotype and the next one... + // ...remember the partial sum, so that we can start here on the next hap. 
+ if (i == cacheSumIndex) + partialSum = finalArraySumProbabilities; + + // rotate array references + double[] tempMatchArray = grandparentMatchArray; + double[] tempDeleteArray = grandparentDeleteArray; + double[] tempInsertArray = grandparentInsertArray; + + grandparentMatchArray = parentMatchArray; + grandparentDeleteArray = parentDeleteArray; + grandparentInsertArray = parentInsertArray; + + parentMatchArray = currentMatchArray; + parentDeleteArray = currentDeleteArray; + parentInsertArray = currentInsertArray; + + currentMatchArray = tempMatchArray; + currentDeleteArray = tempDeleteArray; + currentInsertArray = tempInsertArray; + } + // The cache arrays we wrote for this haplotype will be read for the next haplotype. + matchCacheArray = nextMatchCacheArray.clone(); + deleteCacheArray = nextDeleteCacheArray.clone(); + insertCacheArray = nextInsertCacheArray.clone(); + + //return result + return Math.log10(finalArraySumProbabilities) - INITIAL_CONDITION_LOG10; + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. 
+ + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + @Requires({ + "insertionGOP != null", + "deletionGOP != null", + "overallGCP != null" + }) + @Ensures("constantsAreInitialized") + protected static void initializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { + for (int i = 0; i < insertionGOP.length; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + transition[i+1][matchToMatch] = QualityUtils.qualToProb((byte) qualIndexGOP); + transition[i+1][indelToMatch] = QualityUtils.qualToProb(overallGCP[i]); + transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProb(insertionGOP[i]); + transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]); + transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]); + transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]); + } + } + + /** + * Updates a cell in the HMM arrays + * + * @param indK index in the arrays to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param transition an array with the six transition relevant to this location + */ + private void updateArrayCell( final int indK, final double prior, final double[] transition) { + 
currentMatchArray[indK] = prior * ( grandparentMatchArray[indK + 1] * transition[matchToMatch] + + grandparentInsertArray[indK + 1] * transition[indelToMatch] + + grandparentDeleteArray[indK + 1] * transition[indelToMatch] ); + currentInsertArray[indK] = parentMatchArray[indK + 1] * transition[matchToInsertion] + parentInsertArray[indK + 1] * transition[insertionToInsertion]; + currentDeleteArray[indK] = parentMatchArray[indK] * transition[matchToDeletion] + parentDeleteArray[indK] * transition[deletionToDeletion]; + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java index 0afd4afe2..d92b918ba 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java @@ -191,7 +191,8 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, - final boolean recacheReadValues ) { + final boolean recacheReadValues, + final int nextHapStartIndex) { return 0.0; } diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java index 49148c152..e745ca1f5 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java @@ -92,7 +92,8 @@ public final class LoglessPairHMM extends N2MemoryPairHMM { final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, - final boolean recacheReadValues ) { + final boolean recacheReadValues, + final int nextHapStartIndex) { if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { final double initialValue = INITIAL_CONDITION / haplotypeBases.length; diff --git 
a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java index 3d8137ecf..7fef514d9 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java @@ -67,12 +67,14 @@ import java.util.zip.GZIPInputStream; */ public class PairHMMTestData { public final String ref; + public final String nextRef; private final String read; public final byte[] baseQuals, insQuals, delQuals, gcp; public final double log10l; - PairHMMTestData(String ref, String read, byte[] baseQuals, byte[] insQuals, byte[] delQuals, byte[] gcp, double log10l) { + PairHMMTestData(String ref, String nextRef, String read, byte[] baseQuals, byte[] insQuals, byte[] delQuals, byte[] gcp, double log10l) { this.ref = ref; + this.nextRef = nextRef; this.read = read; this.baseQuals = baseQuals; this.insQuals = insQuals; @@ -81,8 +83,9 @@ public class PairHMMTestData { this.log10l = log10l; } - PairHMMTestData(String ref, String read, final byte qual) { + PairHMMTestData(String ref, String nextRef, String read, final byte qual) { this.ref = ref; + this.nextRef = nextRef; this.read = read; this.baseQuals = this.insQuals = this.delQuals = Utils.dupBytes(qual, read.length()); this.gcp = Utils.dupBytes((byte)10, read.length()); @@ -92,13 +95,14 @@ public class PairHMMTestData { public double runHMM(final PairHMM hmm) { hmm.initialize(getRead().length(), ref.length()); return hmm.computeReadLikelihoodGivenHaplotypeLog10(ref.getBytes(), getRead().getBytes(), - baseQuals, insQuals, delQuals, gcp, true); + baseQuals, insQuals, delQuals, gcp, true, null); } @Override public String toString() { return "Info{" + "ref='" + ref + '\'' + + ", nextRef=" + nextRef + '\'' + ", read='" + getRead() + '\'' + ", log10l=" + log10l + '}'; @@ -115,7 +119,7 @@ public class PairHMMTestData { 
hmm.initialize(first.getRead().length(), maxHaplotypeLen); for ( final PairHMMTestData datum : data ) { hmm.computeReadLikelihoodGivenHaplotypeLog10(datum.ref.getBytes(), datum.getRead().getBytes(), - datum.baseQuals, datum.insQuals, datum.delQuals, datum.gcp, false); + datum.baseQuals, datum.insQuals, datum.delQuals, datum.gcp, false, datum.nextRef.getBytes()); } } @@ -136,22 +140,44 @@ public class PairHMMTestData { in = new GZIPInputStream(in); } + String[] nextEntry; + String[] thisEntry = null; for ( final String line : new XReadLines(in) ) { - final String[] parts = line.split(" "); - final PairHMMTestData info = new PairHMMTestData( - parts[0], parts[1], - SAMUtils.fastqToPhred(parts[2]), - SAMUtils.fastqToPhred(parts[3]), - SAMUtils.fastqToPhred(parts[4]), - SAMUtils.fastqToPhred(parts[5]), - Double.parseDouble(parts[6])); + // peak at the next entry (to get the haplotype bases) + nextEntry = line.split(" "); + // process the current entry + if (thisEntry != null) { + final PairHMMTestData info = new PairHMMTestData( + thisEntry[0], nextEntry[0], thisEntry[1], + SAMUtils.fastqToPhred(thisEntry[2]), + SAMUtils.fastqToPhred(thisEntry[3]), + SAMUtils.fastqToPhred(thisEntry[4]), + SAMUtils.fastqToPhred(thisEntry[5]), + Double.parseDouble(thisEntry[6])); - if ( ! results.containsKey(info.read) ) { - results.put(info.read, new LinkedList()); + if ( ! results.containsKey(info.read) ) { + results.put(info.read, new LinkedList()); + } + final List byHap = results.get(info.read); + byHap.add(info); } - final List byHap = results.get(info.read); - byHap.add(info); + // update the current entry + thisEntry = nextEntry; } + // process the final entry + final PairHMMTestData info = new PairHMMTestData( + thisEntry[0], null, thisEntry[1], + SAMUtils.fastqToPhred(thisEntry[2]), + SAMUtils.fastqToPhred(thisEntry[3]), + SAMUtils.fastqToPhred(thisEntry[4]), + SAMUtils.fastqToPhred(thisEntry[5]), + Double.parseDouble(thisEntry[6])); + + if ( ! 
results.containsKey(info.read) ) { + results.put(info.read, new LinkedList()); + } + final List byHap = results.get(info.read); + byHap.add(info); return results; } diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java index 7334eec3f..b235f95c8 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java @@ -72,16 +72,18 @@ public class PairHMMUnitTest extends BaseTest { final N2MemoryPairHMM exactHMM = new Log10PairHMM(true); // the log truth implementation final N2MemoryPairHMM originalHMM = new Log10PairHMM(false); // the reference implementation final N2MemoryPairHMM loglessHMM = new LoglessPairHMM(); + final PairHMM arrayHMM = new ArrayLoglessPairHMM(); @BeforeClass public void initialize() { exactHMM.doNotUseTristateCorrection(); originalHMM.doNotUseTristateCorrection(); loglessHMM.doNotUseTristateCorrection(); + arrayHMM.doNotUseTristateCorrection(); } - private List getHMMs() { - return Arrays.asList(exactHMM, originalHMM, loglessHMM); + private List getHMMs() { + return Arrays.asList(exactHMM, originalHMM, loglessHMM, arrayHMM); } // -------------------------------------------------------------------------------- @@ -91,8 +93,8 @@ public class PairHMMUnitTest extends BaseTest { // -------------------------------------------------------------------------------- private class BasicLikelihoodTestProvider { - final String ref, read; - final byte[] refBasesWithContext, readBasesWithContext; + final String ref, nextRef, read; + final byte[] refBasesWithContext, nextRefBasesWithContext, readBasesWithContext; final int baseQual, insQual, delQual, gcp; final int expectedQual; final boolean left, right; @@ -100,28 +102,30 @@ public class PairHMMUnitTest extends BaseTest { final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; 
final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; - public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) { - this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); + public BasicLikelihoodTestProvider(final String ref, final String nextRef, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) { + this(ref, nextRef, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); } - public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { + public BasicLikelihoodTestProvider(final String ref, final String nextRef, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { this.baseQual = baseQual; this.delQual = delQual; this.insQual = insQual; this.gcp = gcp; this.read = read; this.ref = ref; + this.nextRef = nextRef; this.expectedQual = expectedQual; this.left = left; this.right = right; refBasesWithContext = asBytes(ref, left, right); + nextRefBasesWithContext = asBytes(nextRef, left, right); readBasesWithContext = asBytes(read, false, false); } @Override public String toString() { - return String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual); + return String.format("ref=%s nextRef=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, nextRef, read, baseQual, insQual, delQual, gcp, left, right, expectedQual); } public double expectedLogL() { @@ -129,7 +133,7 @@ public class PairHMMUnitTest extends BaseTest { } public double getTolerance(final PairHMM 
hmm) { - if ( hmm instanceof LoglessPairHMM) + if ( hmm instanceof LoglessPairHMM || hmm instanceof ArrayLoglessPairHMM) return toleranceFromExact(); if ( hmm instanceof Log10PairHMM ) { return ((Log10PairHMM)hmm).isDoingExactLog10Calculations() ? toleranceFromExact() : toleranceFromReference(); @@ -154,11 +158,14 @@ public class PairHMMUnitTest extends BaseTest { return pairHMM.computeReadLikelihoodGivenHaplotypeLog10( refBasesWithContext, readBasesWithContext, qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel), - qualAsBytes(gcp, false, anchorIndel), true); + qualAsBytes(gcp, false, anchorIndel), true, nextRefBasesWithContext); } private byte[] asBytes(final String bases, final boolean left, final boolean right) { - return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); + if(bases == null) + return null; + else + return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); } private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) { @@ -204,7 +211,8 @@ public class PairHMMUnitTest extends BaseTest { final String ref = new String(new byte[]{refBase}); final String read = new String(new byte[]{readBase}); final int expected = refBase == readBase ? 0 : baseQual; - tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp)}); + // runBasicLikelihoodTests uses calcLogL(), which runs HMM with recacheReads=true. Since we will not cache, should pass null in place of a nextRef + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, null, read, baseQual, indelQual, indelQual, expected, gcp)}); } } @@ -220,10 +228,11 @@ public class PairHMMUnitTest extends BaseTest { final String ref = insertionP ? small : big; final String read = insertionP ? 
big : small; - tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp)}); - tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false)}); - tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true)}); - tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true)}); + // runBasicLikelihoodTests uses calcLogL(), which runs HMM with recacheReads=true. Since we will not cache, should pass null in place of a nextRef + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, null, read, baseQual, indelQual, indelQual, expected, gcp)}); + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, null, read, baseQual, indelQual, indelQual, expected, gcp, true, false)}); + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, null, read, baseQual, indelQual, indelQual, expected, gcp, false, true)}); + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, null, read, baseQual, indelQual, indelQual, expected, gcp, true, true)}); } } } @@ -261,7 +270,8 @@ public class PairHMMUnitTest extends BaseTest { for ( final boolean leftFlank : Arrays.asList(true, false) ) for ( final boolean rightFlank : Arrays.asList(true, false) ) - tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, leftFlank, rightFlank)}); + // runOptimizedLikelihoodTests uses calcLogL(), which runs HMM with recacheReads=true. 
Since we will not cache, should pass null in place of a nextRef + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, null, read, baseQual, indelQual, indelQual, -0, gcp, leftFlank, rightFlank)}); } } } @@ -302,6 +312,7 @@ public class PairHMMUnitTest extends BaseTest { } } + @Test(enabled = !DEBUG) public void testMismatchInEveryPositionInTheReadWithCenteredHaplotype() { final byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); @@ -323,7 +334,7 @@ public class PairHMMUnitTest extends BaseTest { final byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, false); + final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, true, null); final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(matchQual), mread.length-1) * QualityUtils.qualToErrorProb(mismatchQual)); Assert.assertEquals(res1, expected, 1e-2); } @@ -351,7 +362,7 @@ public class PairHMMUnitTest extends BaseTest { final byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? 
(byte)'T' : (byte)'C'); - final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, false); + final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, true , null); final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(matchQual), mread.length-1) * QualityUtils.qualToErrorProb(mismatchQual)); Assert.assertEquals(res1, expected, 1e-2); } @@ -382,11 +393,12 @@ public class PairHMMUnitTest extends BaseTest { final byte delQual = 37; final byte gcp = 10; hmm.initialize(readBases.length, refBases.length); + // running HMM with no haplotype caching. Should therefore pass null in place of nextRef bases final double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), true); + Utils.dupBytes(gcp, readBases.length), true, null); Assert.assertTrue(d <= 0.0, "Likelihoods should be <= 0 but got "+ d); } @@ -399,11 +411,12 @@ public class PairHMMUnitTest extends BaseTest { final byte delQual = 100; final byte gcp = 100; hmm.initialize(readBases.length, refBases.length); + // running HMM with no haplotype caching. 
Should therefore pass null in place of nextRef bases double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), true); + Utils.dupBytes(gcp, readBases.length), true, null); double expected = 0; final double initialCondition = ((double) Math.abs(refBases.length-readBases.length+1))/refBases.length; if (readBases.length < refBases.length) { @@ -445,11 +458,12 @@ public class PairHMMUnitTest extends BaseTest { final byte delQual = 40; final byte gcp = 10; hmm.initialize(readBases.length, refBases.length); + // running HMM with no haplotype caching. Should therefore pass null in place of nextRef bases hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), true); + Utils.dupBytes(gcp, readBases.length), true, null); } @Test(enabled = !DEBUG) @@ -460,20 +474,27 @@ public class PairHMMUnitTest extends BaseTest { final byte insQual = 40; final byte delQual = 40; final byte gcp = 10; - + // running HMMs with no haplotype caching. 
Should therefore pass null in place of nextRef bases exactHMM.initialize(readBases.length, refBases.length); exactHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), true); + Utils.dupBytes(gcp, readBases.length), true, null); loglessHMM.initialize(readBases.length, refBases.length); loglessHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), true); + Utils.dupBytes(gcp, readBases.length), true, null); + + arrayHMM.initialize(readBases.length, refBases.length); + arrayHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, + Utils.dupBytes(baseQual, readBases.length), + Utils.dupBytes(insQual, readBases.length), + Utils.dupBytes(delQual, readBases.length), + Utils.dupBytes(gcp, readBases.length), true, null); } @DataProvider(name = "JustHMMProvider") @@ -498,7 +519,8 @@ public class PairHMMUnitTest extends BaseTest { final byte[] gcp = Utils.dupBytes((byte) 10, delQual.length); hmm.initialize(readBases.length + 100, refBases.length + 100); for ( int nExtraMaxSize = 0; nExtraMaxSize < 100; nExtraMaxSize++ ) { - hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, quals, insQual, delQual, gcp, true); + // running HMM with no haplotype caching. 
Should therefore pass null in place of nextRef bases + hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, quals, insQual, delQual, gcp, true, null); } } @@ -506,10 +528,12 @@ public class PairHMMUnitTest extends BaseTest { public Object[][] makeHaplotypeIndexingProvider() { List tests = new ArrayList(); + // First difference (root2, root3) is the base position immediately following first difference (root1, root2) final String root1 = "ACGTGTCAAACCGGGTT"; - final String root2 = "ACGTGTCACACTGGGTT"; // differs in two locations + final String root2 = "ACGTGTCACACTGGGTT"; // differs in two locations from root1 + final String root3 = "ACGTGTCACTCCGCGTT"; // differs in two locations from root2 - final String read1 = "ACGTGTCACACTGGATT"; // 1 diff from 2, 2 diff from root1 + final String read1 = "ACGTGTCACACTGGATT"; // 1 diff from 2, 2 diff from root1, 2 diff from root3 final String read2 = root1; // same as root1 final String read3 = root2; // same as root2 final String read4 = "ACGTGTCACACTGGATTCGAT"; @@ -521,7 +545,7 @@ public class PairHMMUnitTest extends BaseTest { // int readLength = read.length(); { for ( int readLength = 10; readLength < read.length(); readLength++ ) { final String myRead = read.substring(0, readLength); - tests.add(new Object[]{hmm, root1, root2, myRead}); + tests.add(new Object[]{hmm, root1, root2, root3, myRead}); } } } @@ -530,7 +554,7 @@ public class PairHMMUnitTest extends BaseTest { } @Test(enabled = !DEBUG, dataProvider = "HaplotypeIndexingProvider") - void testHaplotypeIndexing(final PairHMM hmm, final String root1, final String root2, final String read) { + void testHaplotypeIndexing(final PairHMM hmm, final String root1, final String root2, final String root3, final String read) { final double TOLERANCE = 1e-9; final String prefix = "AACCGGTTTTTGGGCCCAAACGTACGTACAGTTGGTCAACATCGATCAGGTTCCGGAGTAC"; @@ -544,24 +568,30 @@ public class PairHMMUnitTest extends BaseTest { final String myPrefix = prefix.substring(prefixStart, 
prefix.length()); final String hap1 = myPrefix + root1; final String hap2 = myPrefix + root2; + final String hap3 = myPrefix + root3; final int hapStart = PairHMM.findFirstPositionWhereHaplotypesDiffer(hap1.getBytes(), hap2.getBytes()); - final double actual1 = testHaplotypeIndexingCalc(hmm, hap1, read, 0, true); - final double actual2 = testHaplotypeIndexingCalc(hmm, hap2, read, hapStart, false); - final double expected2 = testHaplotypeIndexingCalc(hmm, hap2, read, 0, true); - Assert.assertEquals(actual2, expected2, TOLERANCE, "Caching calculation failed for read " + read + " against haplotype with prefix '" + myPrefix + // Run the HMM on the first haplotype, peaking ahead the second, to set up caching + // Then run on the second haplotype in both cached and uncached mode, and verify that results are the same + // When evaluating actual2, it is important that we both apply old caching from hap1 and set up new caching for hap3, to ensure read/write operations do not cause conflicts + final double actual1 = testHaplotypeIndexingCalc(hmm, hap1, hap2, read, 0, true); + final double actual2 = testHaplotypeIndexingCalc(hmm, hap2, hap3, read, hapStart, false); + final double expected2 = testHaplotypeIndexingCalc(hmm, hap2, null, read, 0, true); + Assert.assertEquals(actual2, expected2, TOLERANCE, "HMM " + hmm.getClass() + " Caching calculation failed for read " + read + " against haplotype with prefix '" + myPrefix + "' expected " + expected2 + " but got " + actual2 + " with hapStart of " + hapStart); } } - private double testHaplotypeIndexingCalc(final PairHMM hmm, final String hap, final String read, final int hapStart, final boolean recache) { + private double testHaplotypeIndexingCalc(final PairHMM hmm, final String hap, final String nextHap, final String read, final int hapStart, final boolean recache) { final byte[] readBases = read.getBytes(); + // if not peaking ahead to capture info for a future cache run, the next haplotype will be null, and this should be 
passed to HMM + final byte[] nextHapBases = nextHap == null ? null : nextHap.getBytes(); final byte[] baseQuals = Utils.dupBytes((byte)30, readBases.length); final byte[] insQuals = Utils.dupBytes((byte)45, readBases.length); final byte[] delQuals = Utils.dupBytes((byte)40, readBases.length); final byte[] gcp = Utils.dupBytes((byte)10, readBases.length); - double d = hmm.computeReadLikelihoodGivenHaplotypeLog10(hap.getBytes(), readBases, baseQuals, insQuals, delQuals, gcp, recache); + double d = hmm.computeReadLikelihoodGivenHaplotypeLog10(hap.getBytes(), readBases, baseQuals, insQuals, delQuals, gcp, recache, nextHapBases); Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d + " was bad for read " + read + " and ref " + hap + " with hapStart " + hapStart); return d; } @@ -576,7 +606,6 @@ public class PairHMMUnitTest extends BaseTest { for ( final boolean oneIsDiff : Arrays.asList(true, false) ) { final byte[] hap1 = Utils.dupBytes((byte)'A', haplotypeSize1); final byte[] hap2 = Utils.dupBytes((byte)'A', haplotypeSize2); - final int expected = oneIsDiff ? 
makeDiff(hap1, differingSite, minLength) : makeDiff(hap2, differingSite, minLength); @@ -604,6 +633,10 @@ public class PairHMMUnitTest extends BaseTest { myLoglessPairHMM.doNotUseTristateCorrection(); tests.add(new Object[]{myLoglessPairHMM}); + final ArrayLoglessPairHMM myArrayLoglessPairHMM = new ArrayLoglessPairHMM(); + myArrayLoglessPairHMM.doNotUseTristateCorrection(); + tests.add(new Object[]{myArrayLoglessPairHMM}); + final Log10PairHMM myLog10PairHMM = new Log10PairHMM(true); myLog10PairHMM.doNotUseTristateCorrection(); tests.add(new Object[]{myLog10PairHMM}); @@ -619,7 +652,7 @@ public class PairHMMUnitTest extends BaseTest { // didn't call initialize => should exception out double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, - baseQuals, baseQuals, baseQuals, baseQuals, true); + baseQuals, baseQuals, baseQuals, baseQuals, true, null); } @Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider") @@ -630,7 +663,7 @@ public class PairHMMUnitTest extends BaseTest { hmm.initialize(3, 3); double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, - baseQuals, baseQuals, baseQuals, baseQuals, true); + baseQuals, baseQuals, baseQuals, baseQuals, true, null); } @Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider") @@ -641,6 +674,6 @@ public class PairHMMUnitTest extends BaseTest { hmm.initialize(2, 3); double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, - baseQuals, baseQuals, baseQuals, baseQuals, true); + baseQuals, baseQuals, baseQuals, baseQuals, true, null); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java index a75c9426c..e7bc5cb56 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java @@ -101,7 +101,8 @@ public final class Log10PairHMM extends N2MemoryPairHMM { final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, - final boolean recacheReadValues ) { + final boolean recacheReadValues, + final int nextHapStartIndex) { if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { // set the initial value (free deletions in the beginning) for the first row in the deletion matrix diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java index 1b277d3d8..a091a0716 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java @@ -44,10 +44,6 @@ abstract class N2MemoryPairHMM extends PairHMM { protected double[][] insertionMatrix = null; protected double[][] deletionMatrix = null; - // only used for debugging purposes - protected boolean doNotUseTristateCorrection = false; - protected void doNotUseTristateCorrection() { doNotUseTristateCorrection = true; } - /** * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths * diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java index f4f70ac63..eb52f4a85 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -43,6 +43,7 @@ public abstract class PairHMM { protected boolean constantsAreInitialized = false; protected byte[] previousHaplotypeBases; + protected int hapStartIndex; public enum HMM_IMPLEMENTATION { /* Very slow implementation which uses very accurate log10 sum functions. 
Only meant to be used as a reference test implementation */ @@ -51,6 +52,8 @@ public abstract class PairHMM { ORIGINAL, /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ LOGLESS_CACHING, + /* Logless caching PairHMM that stores computations in 1D arrays instead of matrices, and which proceeds diagonally over the (read x haplotype) intersection matrix */ + ARRAY_LOGLESS } protected int maxHaplotypeLength, maxReadLength; @@ -58,6 +61,10 @@ public abstract class PairHMM { protected int paddedReadLength, paddedHaplotypeLength; private boolean initialized = false; + // only used for debugging purposes + protected boolean doNotUseTristateCorrection = false; + protected void doNotUseTristateCorrection() { doNotUseTristateCorrection = true; } + /** * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths * @@ -109,7 +116,8 @@ public abstract class PairHMM { final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final boolean recacheReadValues ) { + final boolean recacheReadValues, + final byte[] nextHaploytpeBases) { if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength); @@ -123,9 +131,13 @@ public abstract class PairHMM { paddedReadLength = readBases.length + 1; paddedHaplotypeLength = haplotypeBases.length + 1; - final int hapStartIndex = (previousHaplotypeBases == null || haplotypeBases.length != previousHaplotypeBases.length || recacheReadValues) ? 
0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, previousHaplotypeBases); + hapStartIndex = (recacheReadValues) ? 0 : hapStartIndex; - double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues); + // Pre-compute the difference between the current haplotype and the next one to be run + // Looking ahead is necessary for the ArrayLoglessPairHMM implementation + final int nextHapStartIndex = (nextHaploytpeBases == null || haplotypeBases.length != nextHaploytpeBases.length) ? 0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, nextHaploytpeBases); + + double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues, nextHapStartIndex); if ( ! MathUtils.goodLog10Probability(result) ) throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result)); @@ -134,6 +146,10 @@ public abstract class PairHMM { // Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper. previousHaplotypeBases = haplotypeBases; + // For the next iteration, the hapStartIndex for the next haploytpe becomes the index for the current haplotype + // The array implementation has to look ahead to the next haplotype to store caching info. It cannot do this if nextHapStart is before hapStart + hapStartIndex = (nextHapStartIndex < hapStartIndex) ? 
0: nextHapStartIndex; + return result; } @@ -149,7 +165,8 @@ public abstract class PairHMM { final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, - final boolean recacheReadValues ); + final boolean recacheReadValues, + final int nextHapStartIndex); /** * Compute the first position at which two haplotypes differ From 86fe9fae76e81b13fb801223aad37d7451fd7d1c Mon Sep 17 00:00:00 2001 From: bradtaylor Date: Wed, 7 Aug 2013 18:27:30 -0400 Subject: [PATCH 03/77] Changes to Array PairHMM to address review comments Returned Logless Caching implementation to the default in all cases. Changing to the array version will await performance benchmarking Refactored many pieces of functionality in ArrayLoglessPairHMM into their own methods. --- .../genotyper/UnifiedArgumentCollection.java | 2 +- .../haplotypecaller/HaplotypeCaller.java | 2 +- .../utils/pairhmm/ArrayLoglessPairHMM.java | 278 ++++++++++++------ 3 files changed, 186 insertions(+), 96 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index ff6bc5407..4fae3d6e3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -85,7 +85,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection * The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. 
*/ @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false) - public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ARRAY_LOGLESS; + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; /** * The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 7edf55fed..0b95ed07e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -387,7 +387,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @Hidden @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) - public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ARRAY_LOGLESS; + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; @Hidden @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java index 26eb745bd..4b996e770 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java @@ -130,6 +130,7 @@ public class ArrayLoglessPairHMM extends 
PairHMM { nextMatchCacheArray = new double[paddedMaxReadLength]; nextDeleteCacheArray = new double[paddedMaxReadLength]; nextInsertCacheArray = new double [paddedMaxReadLength]; + } @@ -163,59 +164,27 @@ public class ArrayLoglessPairHMM extends PairHMM { // note that we initialized the constants constantsAreInitialized = true; - // Pad the ends of the Match and Insert arrays with 0. Analogous to the first row in the Match, Insert matrices of N2MemoryPairHMM - grandparentMatchArray[readBases.length] = 0; - grandparentInsertArray[readBases.length] = 0; - parentMatchArray[readBases.length] = 0; - parentInsertArray[readBases.length] = 0; - currentMatchArray[readBases.length] = 0; - currentInsertArray[readBases.length] = 0; - matchCacheArray[readBases.length] = 0; - insertCacheArray[readBases.length] = 0; - nextMatchCacheArray[readBases.length] = 0; - nextInsertCacheArray[readBases.length] = 0; + // Read length may have changed, so we need to set zero-value padding at the appropriate position. + padMatchAndInsertArrays(readBases.length); } - // if we have not cached from prev haplotype, clear any info we may have accumulated in a previous HMM iteration + + // if we have not cached from a previous haplotype, clear any info we may have accumulated in a previous HMM iteration if (hapStartIndex == 0) { - Arrays.fill(matchCacheArray, 0, readBases.length, 0); - Arrays.fill(deleteCacheArray, 0, readBases.length, 0); - Arrays.fill(insertCacheArray, 0, readBases.length, 0); + clearPreviouslyCachedInfo(readBases.length); - partialSum = 0; - - // Padding value for the deletion arrays. Let's us have free deletions at the beginning - // Needs to be reset when starting a new read or when hap length changes (ie when hapStartIndex is 0) - final double initialValue = INITIAL_CONDITION / haplotypeBases.length; - // Pad the deletion arrays. 
Akin to padding the first row in the deletion matrix - parentDeleteArray[readBases.length] = initialValue; - grandparentDeleteArray[readBases.length] = initialValue; - currentDeleteArray[readBases.length] = initialValue; - deleteCacheArray[readBases.length] = initialValue; - nextDeleteCacheArray[readBases.length] = initialValue; + // Haplotype length may have changed, so we need to set initial-value padding at the appropriate position. + padDeleteArrays(haplotypeBases.length, readBases.length); } - // We build up our solution by looking at position [0] in the match, insert arrays. Need to set to 0 before we start. - grandparentMatchArray[0] = 0; - grandparentInsertArray[0] = 0; - parentMatchArray[0] = 0; - parentInsertArray[0] = 0; - currentMatchArray[0] = 0; - currentInsertArray[0] = 0; - // Array implementation. Start by initializing some array parameters - // Number of diagonals for a matrix = rows + cols - 1; - final int maxDiagonals = readBases.length + haplotypeBases.length - hapStartIndex - 1; - // The array indices we want to fill will be between these values - int startFill; - int endFill; - // The position of the arrays to be updated - int arrayIndex; - // The coordinate in our priors and transition matrices corresponding to a given position in the read/haplotype alignment - int matrixRow; - int matrixCol; - // The final answer prior to log10 correction - double finalArraySumProbabilities = partialSum; - // This array will contain the partial sum to cache for the next haplotype - final int cacheSumIndex = nextHapStartIndex - hapStartIndex + readBases.length - 1; + // We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start. 
+ clearArraySolutionPosition(); + + // Some parameters to control behavior during the dynamic programming loop + final int maxDiagonals = readBases.length + haplotypeBases.length - hapStartIndex - 1; // Number of diagonals for a matrix = rows + cols - 1; + int startFill; // The lower bound of the array indices we want to over-write + int endFill; // The upper bound of the array indices we want to over-write + final int cacheSumIndex = nextHapStartIndex - hapStartIndex + readBases.length - 1; // This array will contain the partial sum to cache for the next haplotype + double finalArraySumProbabilities = partialSum; // The final answer prior to log10 correction // Perform dynamic programming using arrays, as if over diagonals of a hypothetical read/haplotype alignment matrix for (int i = 1; i <= maxDiagonals; i++) { @@ -224,34 +193,11 @@ public class ArrayLoglessPairHMM extends PairHMM { endFill = Math.min(maxDiagonals - i + 1, readBases.length); // apply any previously cached array information - if (i <= readBases.length) { - // apply caching info necessary for calculating current DELETE array values - parentMatchArray[startFill] = matchCacheArray[startFill]; - parentDeleteArray[startFill] = deleteCacheArray[startFill]; - // apply caching info necessary for calculating current MATCH array values - grandparentMatchArray[startFill + 1] = matchCacheArray[startFill + 1]; - grandparentDeleteArray[startFill + 1] = deleteCacheArray[startFill + 1]; - grandparentInsertArray[startFill + 1] = insertCacheArray[startFill + 1]; - } + if (i <= readBases.length) + applyPreviouslyCachedInfo(startFill); - // fill in the cells for our arrays - for (arrayIndex = startFill; arrayIndex < endFill; arrayIndex++) { - - // translate the array position into a row, column in the priors and transition matrices - matrixRow = readBases.length - arrayIndex - 1; - matrixCol = i - matrixRow - 1 + hapStartIndex; - - // update cell for each of our new arrays. 
Prior, transition matrices are padded +1 row,col - updateArrayCell(arrayIndex, prior[matrixRow+1][matrixCol+1], transition[matrixRow+1]); - - // Set up caching for the next haplotype - // At the position of the final similar base between this haplotype and the next one, remember the mid-array values - if (matrixCol == nextHapStartIndex - 1) { - nextMatchCacheArray[arrayIndex] = currentMatchArray[arrayIndex]; - nextDeleteCacheArray[arrayIndex] = currentDeleteArray[arrayIndex]; - nextInsertCacheArray[arrayIndex] = currentInsertArray[arrayIndex]; - } - } + // fill in the cells for our current arrays + updateArrays(readBases.length, hapStartIndex, nextHapStartIndex, startFill, endFill, i); // final probability is the log10 sum of the last element in the Match and Insertion state arrays // this way we ignore all paths that ended in deletions! (huge) @@ -265,27 +211,10 @@ public class ArrayLoglessPairHMM extends PairHMM { if (i == cacheSumIndex) partialSum = finalArraySumProbabilities; - // rotate array references - double[] tempMatchArray = grandparentMatchArray; - double[] tempDeleteArray = grandparentDeleteArray; - double[] tempInsertArray = grandparentInsertArray; - - grandparentMatchArray = parentMatchArray; - grandparentDeleteArray = parentDeleteArray; - grandparentInsertArray = parentInsertArray; - - parentMatchArray = currentMatchArray; - parentDeleteArray = currentDeleteArray; - parentInsertArray = currentInsertArray; - - currentMatchArray = tempMatchArray; - currentDeleteArray = tempDeleteArray; - currentInsertArray = tempInsertArray; + rotateArrayReferences(); } // The cache arrays we wrote for this haplotype will be read for the next haplotype. 
- matchCacheArray = nextMatchCacheArray.clone(); - deleteCacheArray = nextDeleteCacheArray.clone(); - insertCacheArray = nextInsertCacheArray.clone(); + rotateCacheArrays(); //return result return Math.log10(finalArraySumProbabilities) - INITIAL_CONDITION_LOG10; @@ -341,6 +270,136 @@ public class ArrayLoglessPairHMM extends PairHMM { } } + /** + * Pad the ends of the Match and Insert arrays with 0. + * Analogous to setting zeros in the first row in the Match, Insert matrices of N2MemoryPairHMM. + * + * @param padPosition Which index in the arrays we wish to pad + */ + private void padMatchAndInsertArrays(final int padPosition) { + grandparentMatchArray[padPosition] = 0; + grandparentInsertArray[padPosition] = 0; + parentMatchArray[padPosition] = 0; + parentInsertArray[padPosition] = 0; + currentMatchArray[padPosition] = 0; + currentInsertArray[padPosition] = 0; + matchCacheArray[padPosition] = 0; + insertCacheArray[padPosition] = 0; + nextMatchCacheArray[padPosition] = 0; + nextInsertCacheArray[padPosition] = 0; + } + + /** + * Pad the Delete arrays with an intial value. Let's us have free deletions at the beginning of the alignment. + * Analogous to padding the first row of the Delete matrix of N2MemoryPairHMM. + * + * @param haplotypeLength The length of the present haplotype. Necessary for calculating initial padding value + * @param padPosition Which index in the arrays we wish to pad + */ + private void padDeleteArrays(final int haplotypeLength, final int padPosition) { + final double initialValue = INITIAL_CONDITION / haplotypeLength; + + // Pad the deletion arrays. 
Akin to padding the first row in the deletion matrix + parentDeleteArray[padPosition] = initialValue; + grandparentDeleteArray[padPosition] = initialValue; + currentDeleteArray[padPosition] = initialValue; + deleteCacheArray[padPosition] = initialValue; + nextDeleteCacheArray[padPosition] = initialValue; + } + + /** + * We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start. + * + */ + private void clearArraySolutionPosition() { + grandparentMatchArray[0] = 0; + grandparentInsertArray[0] = 0; + parentMatchArray[0] = 0; + parentInsertArray[0] = 0; + currentMatchArray[0] = 0; + currentInsertArray[0] = 0; + } + + /** + * Clears cached information saved from the last haplotype, + * allowing us to start at the beginning of the present haplotype with intitial values of 0. + * + * @param fillLength How much of the cache arrays do we need to zero + */ + private void clearPreviouslyCachedInfo(final int fillLength) { + Arrays.fill(matchCacheArray, 0, fillLength, 0); + Arrays.fill(deleteCacheArray, 0, fillLength, 0); + Arrays.fill(insertCacheArray, 0, fillLength, 0); + + partialSum = 0; + } + + /** + * Applies cached information saved from the last haplotype, + * allowing us to start in the middle of the present haplotype. + * + * @param indK the index in the arrays we wish to update with cached info + */ + private void applyPreviouslyCachedInfo(int indK) { + // apply caching info necessary for calculating current DELETE array values + parentMatchArray[indK] = matchCacheArray[indK]; + parentDeleteArray[indK] = deleteCacheArray[indK]; + + // apply caching info necessary for calculating current MATCH array values + grandparentMatchArray[indK + 1] = matchCacheArray[indK + 1]; + grandparentDeleteArray[indK + 1] = deleteCacheArray[indK + 1]; + grandparentInsertArray[indK + 1] = insertCacheArray[indK + 1]; + } + + /** + * Records the mid-process state of one location in the read/haplotype alignment. 
+ * Writes new cache information for use with the next haplotype we see. + * + * @param indK the index in the cache arrays we wish to store information in + */ + private void recordNewCacheInfo(int indK) { + nextMatchCacheArray[indK] = currentMatchArray[indK]; + nextDeleteCacheArray[indK] = currentDeleteArray[indK]; + nextInsertCacheArray[indK] = currentInsertArray[indK]; + } + + /** + * Update the HMM arrays for the current diagonal. + * + * @param readLength The length of the read + * @param hapStartIndex An offset that tells us if we are starting in the middle of the present haplotype + * @param nextHapStartIndex An offset that tells us which base in the NEXT haplotype we need to look at to record new caching info + * @param startFill The lower bound of the array indices we want to over-write + * @param endFill The upper bound of the array indices we want to over-write + * @param iii The index indicating which diagonal of the read/haplotype alignment we are working on + */ + private void updateArrays(final int readLength, + final int hapStartIndex, + final int nextHapStartIndex, + final int startFill, + final int endFill, + final int iii) { + + // The coordinate in our priors and transition matrices corresponding to a given position in the read/haplotype alignment + int matrixRow; + int matrixCol; + + int arrayIndex; + for (arrayIndex = startFill; arrayIndex < endFill; arrayIndex++) { + // translate the array position into a row, column in the priors and transition matrices + matrixRow = readLength - arrayIndex - 1; + matrixCol = iii - matrixRow - 1 + hapStartIndex; + + // update cell for each of our current arrays. 
Prior, transition matrices are padded +1 row,col + updateArrayCell(arrayIndex, prior[matrixRow+1][matrixCol+1], transition[matrixRow+1]); + + // Set up caching for the next haplotype + // At the position of the final similar base between this haplotype and the next one, remember the mid-array values + if (matrixCol == nextHapStartIndex - 1) + recordNewCacheInfo(arrayIndex); + } + } + /** * Updates a cell in the HMM arrays * @@ -356,4 +415,35 @@ public class ArrayLoglessPairHMM extends PairHMM { currentDeleteArray[indK] = parentMatchArray[indK] * transition[matchToDeletion] + parentDeleteArray[indK] * transition[deletionToDeletion]; } + /** + * To prepare for the next diagonal in our loop, each array must be bumped to an older generation + * + */ + private void rotateArrayReferences() { + double[] tempMatchArray = grandparentMatchArray; + double[] tempDeleteArray = grandparentDeleteArray; + double[] tempInsertArray = grandparentInsertArray; + + grandparentMatchArray = parentMatchArray; + grandparentDeleteArray = parentDeleteArray; + grandparentInsertArray = parentInsertArray; + + parentMatchArray = currentMatchArray; + parentDeleteArray = currentDeleteArray; + parentInsertArray = currentInsertArray; + + currentMatchArray = tempMatchArray; + currentDeleteArray = tempDeleteArray; + currentInsertArray = tempInsertArray; + } + + /** + * To prepare for the next haplotype, the caching info we wrote is copied into the cach-read arrays + * + */ + private void rotateCacheArrays() { + matchCacheArray = nextMatchCacheArray.clone(); + deleteCacheArray = nextDeleteCacheArray.clone(); + insertCacheArray = nextInsertCacheArray.clone(); + } } From 0435bbe38fda417ba6a88b99bc0c5b515bcaca1e Mon Sep 17 00:00:00 2001 From: bradtaylor Date: Sun, 18 Aug 2013 21:24:58 -0400 Subject: [PATCH 04/77] Retreived PairHMM benchmarks from archive and made improvements PairHMMSyntheticBenchmark and PairHMMEmpirical benchmark were written to test the banded pairHMM, and were archived along with it. 
I returned them to the test directory for use in benchmarking the ArrayLoglessPairHMM. I commented out references to the banded pairHMM (which was left in archive), rather than removing those references entirely. Renamed PairHMMEmpiricalBenchmark to PairHMMBandedEmpiricalBenchmark and returned it to the archive. It has a few problems for use as a general benchmark, including initializing the HMM too frequently and doing too much setup work in the 'time' method. However, since the size selection and debug printing are useful for testing the banded implementation, I decided to keep it as-is and archive it alongside with the other banded pairHMM classes. I did fix one bug that was causing the selectWorkingData function to return prematurely. As a result, the benchmark was only evaluating 4-40 pairHMM calls instead of the desired "maxRecords". I wrote a new PairHMMEmpiricalBenchmark that simply works through a list of data, with setup work and hmm-initialization moved to its own function. This involved writing a new data read-in function in PairHMMTestData. The original was not maintaining the input data in order, the end result of which would be an over-estimate of how much caching we are able to do. The new benchmark class more closely mirrors real-world operation over large data. It might be cleaner to fix some of the issues with the BandedEmpiricalBenchmark and use one read-in function. However, this would involve more extensive changes to: PairHMMBandedEmpiricalBenchmark PairHMMTestData BandedLoglessPairHMMUnitTest I decided against this as the banded benchmark and unit test are archived. 
--- .../pairhmm/PairHMMEmpiricalBenchmark.java | 115 ++++++++++++++++ .../pairhmm/PairHMMSyntheticBenchmark.java | 127 ++++++++++++++++++ .../sting/utils/pairhmm/PairHMMTestData.java | 84 ++++++++++-- 3 files changed, 318 insertions(+), 8 deletions(-) create mode 100644 protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java create mode 100644 protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMSyntheticBenchmark.java rename protected/java/{src => test}/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java (80%) diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java new file mode 100644 index 000000000..151097aad --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java @@ -0,0 +1,115 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; + +import java.io.File; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.lang.Math; + +/** + * Caliper microbenchmark for empirical test data for PairHMM + */ +public class PairHMMEmpiricalBenchmark extends SimpleBenchmark { + @Param ({"array_logless", "logless"}) + String algorithm; + + @Param({"likelihoods_NA12878_HiSeqWGS_chr20_1mb.txt"}) + String likelihoodsFile; + + @Param({"1000","10000","70000"}) + int records; + + PairHMM hmm =null; + + List empiricalData = new LinkedList<>(); + List workingData = new LinkedList<>(); + + + @Override + protected void setUp() throws Exception { + empiricalData = PairHMMTestData.readLikelihoodsInOrder(new File(likelihoodsFile)); + records = Math.min(records, empiricalData.size()); + workingData = empiricalData.subList(0,records); + + int maxReadLength = PairHMMTestData.calcMaxReadLen(workingData); + int maxHaplotypeLength = PairHMMTestData.calcMaxHaplotypeLen(workingData); + + hmm = getHmm(); + hmm.initialize(maxReadLength,maxHaplotypeLength); + } + + private PairHMM getHmm() { + switch (algorithm) { + case "logless": return new LoglessPairHMM(); + case "array_logless": return new ArrayLoglessPairHMM(); + default: throw new IllegalStateException("Unexpected algorithm " + algorithm); + } + } + + public double timeHMM(int rep){ + double result = 0; + for (int i = 0; i < rep; i++) { + for (final PairHMMTestData datum : workingData){ + result += hmm.computeReadLikelihoodGivenHaplotypeLog10(datum.ref.getBytes(), + datum.getRead().getBytes(), + datum.baseQuals, + datum.insQuals, + datum.delQuals, + datum.gcp, + datum.newRead, + datum.nextRef.getBytes()); + + } + } + return result; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMSyntheticBenchmark.java 
b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMSyntheticBenchmark.java new file mode 100644 index 000000000..9706c0e9d --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMSyntheticBenchmark.java @@ -0,0 +1,127 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.TextCigarCodec; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; + +import java.util.*; + +/** + * Caliper microbenchmark for synthetic test data for PairHMM + */ +public class PairHMMSyntheticBenchmark extends SimpleBenchmark { + @Param ({"array_logless", "logless"}) +// @Param({"logless", "array_logless"}) +// @Param({"logless", "banded_w5_mle10", "banded_w5_mle20"}) +// @Param({"logless", "banded_w10_mle20", "banded_w5_mle20", "banded_w5_mle10"}) + String algorithm; + +// @Param({"40", "100", "200", "300", "500"}) + @Param({"40", "300"}) +// @Param({"300"}) + int refLength; + +// @Param({"200"}) + @Param({"40", "101", "200"}) +// @Param({"40", "100", "200", "300", "500"}) + int readLength; + + private PairHMM hmm; + private PairHMMTestData testData; + + + @Override + protected 
void setUp() throws Exception { + hmm = getHmm(); + final String ref = generateSeq(refLength); + final String nextRef = generateSeq(refLength); + final String read = generateSeq(readLength); + testData = new PairHMMTestData(ref, nextRef, read, (byte)30); + System.out.println(testData.toString()); + } + + private PairHMM getHmm() { + switch (algorithm) { + case "logless": return new LoglessPairHMM(); + case "array_logless": return new ArrayLoglessPairHMM(); +// case "banded_w10_mle20": return new BandedLoglessPairHMM(10, 1e-20); +// case "banded_w5_mle20": return new BandedLoglessPairHMM(5, 1e-20); +// case "banded_w5_mle10": return new BandedLoglessPairHMM(5, 1e-10); + default: throw new IllegalStateException("Unexpected algorithm " + algorithm); + } + } + + private String generateSeq(final int len) { + final List root = Arrays.asList("A", "C", "G", "T"); + + String seq = ""; + for ( int i = 0; true; i++ ) { + final String base = root.get(i % root.size()); + final int copies = i / root.size() + 1; + seq += Utils.dupString(base, copies); + if ( seq.length() >= len ) + return seq.substring(0, len); + } + } + + public void timePairHMM(int rep) { + for ( int i = 0; i < rep; i++ ) { + testData.runHMM(hmm); + } +// if ( hmm instanceof BandedLoglessPairHMM ) { +// final BandedLoglessPairHMM banded = (BandedLoglessPairHMM)hmm; +// System.out.printf("Banded n cells possible : %d%n", banded.nCellsOverall); +// System.out.printf("Banded n cells evaluated : %d%n", banded.nCellsEvaluated); +// } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java similarity index 80% rename from protected/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java rename to protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java index 7fef514d9..e6c8e1e61 100644 --- 
a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java @@ -71,8 +71,9 @@ public class PairHMMTestData { private final String read; public final byte[] baseQuals, insQuals, delQuals, gcp; public final double log10l; + public final boolean newRead; - PairHMMTestData(String ref, String nextRef, String read, byte[] baseQuals, byte[] insQuals, byte[] delQuals, byte[] gcp, double log10l) { + PairHMMTestData(String ref, String nextRef, String read, byte[] baseQuals, byte[] insQuals, byte[] delQuals, byte[] gcp, double log10l, boolean newRead) { this.ref = ref; this.nextRef = nextRef; this.read = read; @@ -81,6 +82,7 @@ public class PairHMMTestData { this.delQuals = delQuals; this.gcp = gcp; this.log10l = log10l; + this.newRead = newRead; } PairHMMTestData(String ref, String nextRef, String read, final byte qual) { @@ -90,6 +92,7 @@ public class PairHMMTestData { this.baseQuals = this.insQuals = this.delQuals = Utils.dupBytes(qual, read.length()); this.gcp = Utils.dupBytes((byte)10, read.length()); this.log10l = -1; + this.newRead = true; } public double runHMM(final PairHMM hmm) { @@ -104,25 +107,28 @@ public class PairHMMTestData { "ref='" + ref + '\'' + ", nextRef=" + nextRef + '\'' + ", read='" + getRead() + '\'' + - ", log10l=" + log10l + + ", log10l=" + log10l + '\'' + + ", newRead=" + newRead + '}'; } - public static void runHMMs(final PairHMM hmm, final List data, final boolean runSingly) { + public static double runHMMs(final PairHMM hmm, final List data, final boolean runSingly) { + double result = 0; if ( runSingly ) { for ( final PairHMMTestData datum : data ) - datum.runHMM(hmm); + result += datum.runHMM(hmm); } else { // running in batch mode final PairHMMTestData first = data.get(0); int maxHaplotypeLen = calcMaxHaplotypeLen(data); hmm.initialize(first.getRead().length(), maxHaplotypeLen); for ( final PairHMMTestData datum : data ) { - 
hmm.computeReadLikelihoodGivenHaplotypeLog10(datum.ref.getBytes(), datum.getRead().getBytes(), - datum.baseQuals, datum.insQuals, datum.delQuals, datum.gcp, false, datum.nextRef.getBytes()); + result += hmm.computeReadLikelihoodGivenHaplotypeLog10(datum.ref.getBytes(), datum.getRead().getBytes(), + datum.baseQuals, datum.insQuals, datum.delQuals, datum.gcp, datum.newRead, datum.nextRef.getBytes()); } } + return result; } public static int calcMaxHaplotypeLen(final List data) { @@ -132,6 +138,13 @@ public class PairHMMTestData { return maxHaplotypeLen; } + public static int calcMaxReadLen(final List data) { + int maxReadLen = 0; + for ( final PairHMMTestData datum : data ) + maxReadLen = Math.max(maxReadLen, datum.getRead().length()); + return maxReadLen; + } + public static Map> readLikelihoods(final File file) throws IOException { final Map> results = new LinkedHashMap<>(); @@ -153,7 +166,8 @@ public class PairHMMTestData { SAMUtils.fastqToPhred(thisEntry[3]), SAMUtils.fastqToPhred(thisEntry[4]), SAMUtils.fastqToPhred(thisEntry[5]), - Double.parseDouble(thisEntry[6])); + Double.parseDouble(thisEntry[6]), + ! results.containsKey(thisEntry[1])); if ( ! results.containsKey(info.read) ) { results.put(info.read, new LinkedList()); @@ -171,7 +185,8 @@ public class PairHMMTestData { SAMUtils.fastqToPhred(thisEntry[3]), SAMUtils.fastqToPhred(thisEntry[4]), SAMUtils.fastqToPhred(thisEntry[5]), - Double.parseDouble(thisEntry[6])); + Double.parseDouble(thisEntry[6]), + ! results.containsKey(thisEntry[1])); if ( ! 
results.containsKey(info.read) ) { results.put(info.read, new LinkedList()); @@ -182,6 +197,59 @@ public class PairHMMTestData { return results; } + + /* + * simplified likelihoods file reader that returns a list instead of a map + * + * readLikelihoods() method was reordering inputs, with the result that caching would be more efficient + * This method simply returns a list of read/haplotype pairs in their original order, providing a more realistic caching scenario + */ + public static List readLikelihoodsInOrder(final File file) throws IOException { + final List results = new LinkedList<>(); + + InputStream in = new FileInputStream(file); + if ( file.getName().endsWith(".gz") ) { + in = new GZIPInputStream(in); + } + + String previousRead = null; + String[] nextEntry; + String[] thisEntry = null; + for ( final String line : new XReadLines(in) ) { + // peak at the next entry (to get the haplotype bases) + nextEntry = line.split(" "); + // process the current entry + if (thisEntry != null) { + final PairHMMTestData info = new PairHMMTestData( + thisEntry[0], nextEntry[0], thisEntry[1], + SAMUtils.fastqToPhred(thisEntry[2]), + SAMUtils.fastqToPhred(thisEntry[3]), + SAMUtils.fastqToPhred(thisEntry[4]), + SAMUtils.fastqToPhred(thisEntry[5]), + Double.parseDouble(thisEntry[6]), + !(thisEntry[1].equals(previousRead))); + + results.add(info); + previousRead = info.getRead(); + } + // update the current entry + thisEntry = nextEntry; + } + // process the final entry + final PairHMMTestData info = new PairHMMTestData( + thisEntry[0], null, thisEntry[1], + SAMUtils.fastqToPhred(thisEntry[2]), + SAMUtils.fastqToPhred(thisEntry[3]), + SAMUtils.fastqToPhred(thisEntry[4]), + SAMUtils.fastqToPhred(thisEntry[5]), + Double.parseDouble(thisEntry[6]), + !(thisEntry[1].equals(previousRead))); + + results.add(info); + + return results; + } + public String getRead() { return read; } From 4473b0065e10e66b106ae03243c6e1770055ba46 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Thu, 
29 Aug 2013 11:23:47 -0400 Subject: [PATCH 05/77] adding a check for the UNAVAILABLE case of GenotypeType in CountVariants --- .../varianteval/VariantEvalIntegrationTest.java | 17 ++++++++++++++++- .../varianteval/evaluators/CountVariants.java | 2 ++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index d695f2d13..9b5290dee 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -48,8 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.varianteval; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.annotations.Test; import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; import java.util.ArrayList; import java.util.Arrays; @@ -376,6 +376,21 @@ public class VariantEvalIntegrationTest extends WalkerTest { executeTestParallel("testEvalTrackWithoutGenotypes",spec); } + @Test + public void testEvalTrackWithoutGenotypesWithSampleFields() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "-eval " + variantEvalTestDataRoot + "noGenotypes.vcf", + "-o %s" + ), + 1, + Arrays.asList("")); //There is no md5 because we only care that this completes without an exception. 
+ executeTest("testEvalTrackWithoutGenotypesWithSampleFields", spec); + + } + @Test public void testMultipleEvalTracksWithoutGenotypes() { String extraArgs = "-T VariantEval -R " + b37KGReference + diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index 33a5a9fc9..63c34586e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -197,6 +197,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval { break; case MIXED: break; + case UNAVAILABLE: + break; default: throw new ReviewedStingException("BUG: Unexpected genotype type: " + g); } From ea0deb1bb23951966a9b9b33ce741c32aa05a390 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 30 Aug 2013 12:18:19 -0400 Subject: [PATCH 06/77] Changed the error for the record size mismatch in the genotyping engine to be a user error since it is possible to reach this state with input VCFs that contain the same event multiple times (and it's not something we want to handle in the code). 
--- .../sting/gatk/walkers/haplotypecaller/GenotypingEngine.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 82029b872..cb3b9b65f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -58,6 +58,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.DefaultHashMap; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.EventMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; @@ -186,7 +187,8 @@ public class GenotypingEngine { if( mergedVC == null ) { continue; } if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) { - throw new ReviewedStingException("Record size mismatch! Something went wrong in the merging of alleles."); + // this is possible in GGA mode when the same event is represented in multiple input records + throw new UserException("The same event (although possibly represented differently) is present in multiple input records at location " + loc + " and this is not something we can handle at this time. 
You will need to remove one of the records in order to proceed with your input file(s)."); } final Map mergeMap = new LinkedHashMap<>(); mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele From b6c3ed0295395cc07dda28a02c05a619ae1983d8 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 6 Sep 2013 09:30:01 -0400 Subject: [PATCH 07/77] Added REVERT SOFTCLIPPED bases to ClipReads --- .../sting/gatk/walkers/readutils/ClipReads.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java index 879022299..dfc36954b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java @@ -305,7 +305,7 @@ public class ClipReads extends ReadWalker Date: Thu, 8 Aug 2013 15:09:38 -0400 Subject: [PATCH 08/77] Created a single sample calling pipeline which leverages the reference model calculation mode of the HaplotypeCaller -- Adding changes to CombineVariants to work with the Reference Model mode of the HaplotypeCaller. 
-- Added -combineAnnotations mode to CombineVariants to merge the info field annotations by taking the median -- Added new StrandBiasBySample genotype annotation for use in computing strand bias from single sample input vcfs -- Bug fixes to calcGenotypeLikelihoodsOfRefVsAny, used in isActive() as well as the reference model -- Added active region trimming capabilities to the reference model mode, not perfect yet, turn off with --dontTrimActiveRegions -- We only realign reads in the reference model if there are non-reference haplotypes, a big time savings -- We only realign reads in the reference model if the read is informative for a particular haplotype over another -- GVCF blocks will now track and output the minimum PLs over the block -- MD5 changes! -- HC tests: from bug fixes in calcGenotypeLikelihoodsOfRefVsAny -- GVCF tests: from HC changes above and adding in active region trimming --- .../annotator/DepthPerAlleleBySample.java | 12 +- .../gatk/walkers/annotator/FisherStrand.java | 96 ++++++- .../walkers/annotator/InbreedingCoeff.java | 18 +- .../walkers/annotator/StrandBiasBySample.java | 100 +++++++ .../genotyper/ConsensusAlleleCounter.java | 7 +- ...elGenotypeLikelihoodsCalculationModel.java | 2 +- ...elGenotypeLikelihoodsCalculationModel.java | 8 +- .../haplotypecaller/ActiveRegionTrimmer.java | 10 +- .../haplotypecaller/GenotypingEngine.java | 2 +- .../haplotypecaller/HaplotypeCaller.java | 25 +- .../haplotypecaller/RefVsAnyResult.java | 10 +- .../ReferenceConfidenceModel.java | 93 ++++-- .../sting/utils/gvcf/GVCFWriter.java | 1 + .../sting/utils/gvcf/HomRefBlock.java | 23 +- .../CalledHaplotypeBAMWriter.java | 1 - .../HaplotypeBAMWriter.java | 28 +- .../VariantAnnotatorIntegrationTest.java | 55 ++++ .../IndelGenotypeLikelihoodsUnitTest.java | 2 +- .../HaplotypeCallerGVCFIntegrationTest.java | 10 +- .../HaplotypeCallerIntegrationTest.java | 12 +- ...aplotypeCallerParallelIntegrationTest.java | 4 +- .../ReferenceConfidenceModelUnitTest.java | 7 +- 
.../CombineVariantsIntegrationTest.java | 19 +- .../sting/utils/gvcf/GVCFWriterUnitTest.java | 14 +- .../sting/utils/gvcf/HomRefBlockUnitTest.java | 22 +- .../traversals/TraverseActiveRegions.java | 2 +- .../walkers/annotator/VariantAnnotator.java | 24 +- .../annotator/VariantAnnotatorEngine.java | 56 ++-- .../walkers/variantutils/CombineVariants.java | 48 ++-- .../broadinstitute/sting/utils/MathUtils.java | 20 +- .../sting/utils/pileup/PileupElement.java | 2 +- .../utils/smithwaterman/SmithWaterman.java | 2 +- .../variant/GATKVariantContextUtils.java | 267 ++++++++++++------ .../sting/utils/MathUtilsUnitTest.java | 10 +- .../GATKVariantContextUtilsUnitTest.java | 46 ++- .../variant/VariantContextBenchmark.java | 10 +- 36 files changed, 781 insertions(+), 287 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index b22ea7931..0da865a85 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -112,18 +112,18 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) { - HashMap alleleCounts = new HashMap(); - for ( Allele allele : vc.getAlleles() ) + final HashMap alleleCounts = new HashMap<>(); + for ( final Allele allele : vc.getAlleles() ) alleleCounts.put(allele.getBases()[0], 0); - ReadBackedPileup pileup = stratifiedContext.getBasePileup(); - for ( PileupElement p : pileup ) { + final ReadBackedPileup pileup = stratifiedContext.getBasePileup(); + for ( final PileupElement p : pileup ) { if 
( alleleCounts.containsKey(p.getBase()) ) alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount()); } // we need to add counts in the correct order - int[] counts = new int[alleleCounts.size()]; + final int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference().getBases()[0]); for (int i = 0; i < vc.getAlternateAlleles().size(); i++) counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]); @@ -141,7 +141,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa final HashMap alleleCounts = new HashMap<>(); for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); } - for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + for ( final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); if (! a.isInformative() ) continue; // read is non-informative final GATKSAMRecord read = el.getKey(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index cdbd43a7a..95be967a2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -58,6 +58,8 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypesContext; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import 
org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -97,6 +99,13 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat if ( !vc.isVariant() ) return null; + if ( vc.hasGenotypes() ) { + final int[][] tableFromPerSampleAnnotations = getTableFromSamples( vc.getGenotypes() ); + if ( tableFromPerSampleAnnotations != null ) { + return pValueForBestTable(tableFromPerSampleAnnotations, null); + } + } + if (vc.isSNP() && stratifiedContexts != null) { final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1); final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST); @@ -117,6 +126,32 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return null; } + /** + * Create the FisherStrand table by retrieving the per-sample strand bias annotation and adding them together + * @param genotypes the genotypes from which to pull out the per-sample strand bias annotation + * @return the table used for the FisherStrand p-value calculation, will be null if none of the genotypes contain the per-sample SB annotation + */ + private int[][] getTableFromSamples( final GenotypesContext genotypes ) { + if( genotypes == null ) { throw new IllegalArgumentException("Genotypes cannot be null."); } + + final int[] sbArray = {0,0,0,0}; // forward-reverse -by- alternate-reference + boolean foundData = false; + + for( final Genotype g : genotypes ) { + if( g.isNoCall() || ! 
g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) ) + continue; + + foundData = true; + final String sbbsString = (String) g.getAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME); + final int[] data = encodeSBBS(sbbsString); + for( int index = 0; index < sbArray.length; index++ ) { + sbArray[index] += data[index]; + } + } + + return ( foundData ? decodeSBBS(sbArray) : null ); + } + /** * Create an annotation for the highest (i.e., least significant) p-value of table1 and table2 * @@ -148,12 +183,56 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat } public List getKeyNames() { - return Arrays.asList(FS); + return Collections.singletonList(FS); } public List getDescriptions() { - return Arrays.asList( - new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias")); + return Collections.singletonList(new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias")); + } + + /** + * Helper function to turn the FisherStrand table into the SB annotation array + * @param table the table used by the FisherStrand annotation + * @return the array used by the per-sample Strand Bias annotation + */ + public static int[] getContingencyArray( final int[][] table ) { + if(table.length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } + if(table[0].length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } + final int[] array = new int[4]; // TODO - if we ever want to do something clever with multi-allelic sites this will need to change + array[0] = table[0][0]; + array[1] = table[0][1]; + array[2] = table[1][0]; + array[3] = table[1][1]; + return array; + } + + /** + * Helper function to parse the genotype annotation into the SB annotation array + * @param string the string that is returned by genotype.getAnnotation("SB") + * 
@return the array used by the per-sample Strand Bias annotation + */ + private static int[] encodeSBBS( final String string ) { + final int[] array = new int[4]; + final StringTokenizer tokenizer = new StringTokenizer(string, ",", false); + for( int index = 0; index < 4; index++ ) { + array[index] = Integer.parseInt(tokenizer.nextToken()); + } + return array; + } + + /** + * Helper function to turn the SB annotation array into the FisherStrand table + * @param array the array used by the per-sample Strand Bias annotation + * @return the table used by the FisherStrand annotation + */ + private static int[][] decodeSBBS( final int[] array ) { + if(array.length != 4) { throw new IllegalArgumentException("Expecting a length = 4 strand bias array."); } + final int[][] table = new int[2][2]; + table[0][0] = array[0]; + table[0][1] = array[1]; + table[1][0] = array[2]; + table[1][1] = array[3]; + return table; } private Double pValueForContingencyTable(int[][] originalTable) { @@ -284,13 +363,16 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * allele2 # # * @return a 2x2 contingency table */ - private static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + public static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); } + if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); } + final Allele ref = vc.getReference(); final Allele alt = vc.getAltAlleleWithHighestAlleleCount(); - int[][] table = new int[2][2]; + final int[][] table = new int[2][2]; - for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { - for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + for (final PerReadAlleleLikelihoodMap maps : 
stratifiedPerReadAlleleLikelihoodMap.values() ) { + for (final Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); final GATKSAMRecord read = el.getKey(); final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index bdf37df71..da2143ec1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -76,8 +76,10 @@ import java.util.*; public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final int MIN_SAMPLES = 10; + private static final String INBREEDING_COEFFICIENT_KEY_NAME = "InbreedingCoeff"; private Set founderIds; + @Override public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, @@ -92,15 +94,15 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno private Map calculateIC(final VariantContext vc) { final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(founderIds); - if ( genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant()) + if (genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant()) return null; int idxAA = 0, idxAB = 1, idxBB = 2; if (!vc.isBiallelic()) { // for non-bliallelic case, do test with most common alt allele. - // Get then corresponding indeces in GL vectors to retrieve GL of AA,AB and BB. 
- int[] idxVector = vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount()); + // Get then corresponding indices in GL vectors to retrieve GL of AA,AB and BB. + final int[] idxVector = vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount()); idxAA = idxVector[0]; idxAB = idxVector[1]; idxBB = idxVector[2]; @@ -132,12 +134,12 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno final double q = 1.0 - p; // expected alternative allele frequency final double F = 1.0 - ( hetCount / ( 2.0 * p * q * (double)N ) ); // inbreeding coefficient - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.4f", F)); - return map; + return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.4f", F)); } - public List getKeyNames() { return Arrays.asList("InbreedingCoeff"); } + @Override + public List getKeyNames() { return Collections.singletonList(INBREEDING_COEFFICIENT_KEY_NAME); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("InbreedingCoeff", 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation")); } + @Override + public List getDescriptions() { return Collections.singletonList(new VCFInfoHeaderLine(INBREEDING_COEFFICIENT_KEY_NAME, 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation")); } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java new file mode 100644 index 000000000..fde344e9f --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java @@ -0,0 +1,100 @@ +/* +* By downloading the PROGRAM you agree to the 
following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypeBuilder; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; + +import java.util.*; + +/** + * Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias + * User: rpoplin + * Date: 8/28/13 + */ + +public class StrandBiasBySample extends GenotypeAnnotation implements ExperimentalAnnotation { + + public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB"; + + @Override + public void annotate(final 
RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { + if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) + return; + + if (alleleLikelihoodMap == null ) + throw new IllegalStateException("StrandBiasBySample can only be used with likelihood based annotations in the HaplotypeCaller"); + + final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc); + + gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table)); + } + + @Override + public List getKeyNames() { return Collections.singletonList(STRAND_BIAS_BY_SAMPLE_KEY_NAME); } + + @Override + public List getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java index ddf47805f..6f16a704f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -76,16 +76,13 @@ public class ConsensusAlleleCounter { private final int minIndelCountForGenotyping; private final boolean doMultiAllelicCalls; private final double minFractionInOneSample; - private final GenomeLocParser locParser; - public ConsensusAlleleCounter(final GenomeLocParser locParser, - final boolean doMultiAllelicCalls, + public ConsensusAlleleCounter(final boolean doMultiAllelicCalls, final 
int minIndelCountForGenotyping, final double minFractionInOneSample) { this.minIndelCountForGenotyping = minIndelCountForGenotyping; this.doMultiAllelicCalls = doMultiAllelicCalls; this.minFractionInOneSample = minFractionInOneSample; - this.locParser = locParser; } /** @@ -289,7 +286,7 @@ public class ConsensusAlleleCounter { if (vcs.isEmpty()) return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false, false); return mergedVC.getAlleles(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index 9c4694955..3cee8f2d8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -108,7 +108,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener final List allAllelesToUse){ - List alleles = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC,true); + List alleles = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, UAC,true); if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE) alleles = 
alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 0f3f7739d..4a3231b3e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -89,9 +89,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood protected static List computeConsensusAlleles(final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final GenomeLocParser locParser, final UnifiedArgumentCollection UAC) { - ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); + ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); return counter.computeConsensusAlleles(ref, contexts, contextType); } @@ -113,7 +112,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // starting a new site: clear allele list haplotypeMap.clear(); perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods - alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels); + alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, UAC, ignoreSNPAllelesWhenGenotypingIndels); if (alleleList.isEmpty()) return null; } @@ -212,7 +211,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - 
final GenomeLocParser locParser, final UnifiedArgumentCollection UAC, final boolean ignoreSNPAllelesWhenGenotypingIndels) { @@ -244,7 +242,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } } else { - alleles = computeConsensusAlleles(ref, contexts, contextType, locParser, UAC); + alleles = computeConsensusAlleles(ref, contexts, contextType, UAC); } return alleles; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java index 063e3b218..4de90b337 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java @@ -101,24 +101,26 @@ class ActiveRegionTrimmer { * * @param region our full active region * @param allVariantsWithinExtendedRegion all of the variants found in the entire region, sorted by their start position + * @param emitReferenceConfidence are we going to estimate the reference confidence with this active region? 
* @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully */ - public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion) { + public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion, final boolean emitReferenceConfidence) { if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants, so just return the current region return null; - final List withinActiveRegion = new LinkedList(); - int pad = snpPadding; + final List withinActiveRegion = new LinkedList<>(); + boolean foundNonSnp = false; GenomeLoc trimLoc = null; for ( final VariantContext vc : allVariantsWithinExtendedRegion ) { final GenomeLoc vcLoc = parser.createGenomeLoc(vc); if ( region.getLocation().overlapsP(vcLoc) ) { if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding - pad = nonSnpPadding; + foundNonSnp = true; trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc); withinActiveRegion.add(vc); } } + final int pad = ( emitReferenceConfidence || foundNonSnp ? 
nonSnpPadding : snpPadding ); // we don't actually have anything in the region after removing variants that don't overlap the region's full location if ( trimLoc == null ) return null; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index cb3b9b65f..cc89792d5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -183,7 +183,7 @@ public class GenotypingEngine { final List priorityList = makePriorityList(eventsAtThisLoc); // Merge the event to find a common reference representation - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false, false); if( mergedVC == null ) { continue; } if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 0b95ed07e..8776a5e4b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -290,7 +290,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * B <= X < C * X >= C * - * The default 
bands give the following GQ blocks: + * The default bands with (1, 10, 20, 30, 40, 50) give the following GQ blocks: * * [0, 0] * (0, 10] @@ -304,7 +304,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @Advanced @Argument(fullName="GVCFGQBands", shortName="GQB", doc="Emit experimental reference confidence scores", required = false) - protected List GVCFGQBands = Arrays.asList(1, 10, 20, 30, 40, 50); + protected List GVCFGQBands = Arrays.asList(5, 20, 60); /** * This parameter determines the maximum size of an indel considered as potentially segregating in the @@ -541,7 +541,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In samplesList.addAll( samples ); // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user - // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine // TODO -- why is this? + // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? 
UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); @@ -553,11 +553,11 @@ public class HaplotypeCaller extends ActiveRegionWalker, In simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling simpleUAC.CONTAMINATION_FRACTION = 0.0; - simpleUAC.CONTAMINATION_FRACTION_FILE=null; + simpleUAC.CONTAMINATION_FRACTION_FILE = null; simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); - if( UAC.CONTAMINATION_FRACTION_FILE !=null) { + if( UAC.CONTAMINATION_FRACTION_FILE != null ) { UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger)); } @@ -867,17 +867,17 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // Create ReadErrorCorrector object if requested - will be used within assembly engine. 
ReadErrorCorrector readErrorCorrector = null; if (errorCorrectReads) - readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG,fullReferenceWithPadding); + readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG, fullReferenceWithPadding); try { - final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector ); - if ( ! emitReferenceConfidence() && ! dontTrimActiveRegions ) { + final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype, readErrorCorrector ); + if ( ! dontTrimActiveRegions ) { return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc); } else { // we don't want to trim active regions, so go ahead and use the old one return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true); } - } catch ( Exception e ) { + } catch ( final Exception e ) { // Capture any exception that might be thrown, and write out the assembly failure BAM if requested if ( captureAssemblyFailureBAM ) { final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5); @@ -969,7 +969,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In EventMap.buildEventMapsForHaplotypes(haplotypes, fullReferenceWithPadding, paddedReferenceLoc, DEBUG); final TreeSet allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypes); allVariantsWithinFullActiveRegion.addAll(activeAllelesToGenotype); - final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalActiveRegion, 
allVariantsWithinFullActiveRegion); + final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalActiveRegion, allVariantsWithinFullActiveRegion, false); // TODO -- should pass emitReferenceConfidence() if ( trimmedActiveRegion == null ) { // there were no variants found within the active region itself, so just return null @@ -1001,7 +1001,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } } - // trim down the reads and add them to the trimmed active region final List trimmedReads = new ArrayList<>(originalActiveRegion.getReads().size()); for( final GATKSAMRecord read : originalActiveRegion.getReads() ) { @@ -1096,7 +1095,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In activeRegion.addAll(downsampledReads); } - private Set filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { + private Set filterNonPassingReads( final ActiveRegion activeRegion ) { final Set readsToRemove = new LinkedHashSet<>(); for( final GATKSAMRecord rec : activeRegion.getReads() ) { if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { @@ -1107,7 +1106,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In return readsToRemove; } - private GenomeLoc getPaddedLoc( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { + private GenomeLoc getPaddedLoc( final ActiveRegion activeRegion ) { final int padLeft = Math.max(activeRegion.getExtendedLoc().getStart()-REFERENCE_PADDING, 1); final int padRight = Math.min(activeRegion.getExtendedLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getExtendedLoc().getContig()).getSequenceLength()); return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight); diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java index ee7565282..8fb7afec7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java @@ -68,5 +68,13 @@ final class RefVsAnyResult { /** * @return Get the DP (sum of AD values) */ - public int getDP() { return AD_Ref_Any[0] + AD_Ref_Any[1]; } + protected int getDP() { return AD_Ref_Any[0] + AD_Ref_Any[1]; } + + /** + * Cap the het and hom var likelihood values by the hom ref likelihood. + */ + protected void capByHomRefLikelihood() { + genotypeLikelihoods[1] = Math.min(genotypeLikelihoods[0], genotypeLikelihoods[1]); + genotypeLikelihoods[2] = Math.min(genotypeLikelihoods[0], genotypeLikelihoods[2]); + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java index 98264d4c2..2c123880d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -61,6 +61,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; import org.broadinstitute.variant.vcf.VCFHeaderLine; @@ -81,10 +82,9 @@ import java.util.*; * Time: 12:52 PM */ public class 
ReferenceConfidenceModel { - public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; - public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site - public final static String INDEL_INFORMATIVE_DEPTH = "CD"; + //public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now + public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele private final GenomeLocParser genomeLocParser; private final Set samples; @@ -94,6 +94,8 @@ public class ReferenceConfidenceModel { private final static boolean WRITE_DEBUGGING_BAM = false; private final SAMFileWriter debuggingWriter; + private final static byte REF_MODEL_DELETION_QUAL = (byte) 30; + /** * Create a new ReferenceConfidenceModel * @@ -124,6 +126,8 @@ public class ReferenceConfidenceModel { } else { debuggingWriter = null; } + + initializeIndelPLCache(); } /** @@ -132,8 +136,9 @@ public class ReferenceConfidenceModel { */ public Set getVCFHeaderLines() { final Set headerLines = new LinkedHashSet<>(); - headerLines.add(new VCFSimpleHeaderLine("ALT", NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); - headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize)); + // TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles? 
+ headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); + //headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize)); return headerLines; } @@ -161,7 +166,7 @@ public class ReferenceConfidenceModel { * @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes * @param variantCalls calls made in this region. The return result will contain any variant call in this list in the * correct order by genomic position, and any variant in this list will stop us emitting a ref confidence - * under any position is covers (for snps that 1 bp, but for deletion its the entire ref span) + * under any position it covers (for snps and insertions that is 1 bp, but for deletions it's the entire ref span) * @return an ordered list of variant contexts that spans activeRegion.getLoc() and includes both reference confidence * contexts as well as calls from variantCalls if any were provided */ @@ -181,7 +186,7 @@ public class ReferenceConfidenceModel { if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different"); final GenomeLoc refSpan = activeRegion.getLocation(); - final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, refSpan, stratifiedReadMap); + final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, stratifiedReadMap); final byte[] ref = refHaplotype.getBases(); final List results = new ArrayList<>(refSpan.size()); final String sampleName = 
stratifiedReadMap.keySet().iterator().next(); @@ -201,9 +206,10 @@ public class ReferenceConfidenceModel { final int refOffset = offset + globalRefOffset; final byte refBase = ref[refOffset]; final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null); + homRefCalc.capByHomRefLikelihood(); final Allele refAllele = Allele.create(refBase, true); - final List refSiteAlleles = Arrays.asList(refAllele, NON_REF_SYMBOLIC_ALLELE); + final List refSiteAlleles = Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), curPos.getStart(), curPos.getStart(), refSiteAlleles); final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele)); gb.AD(homRefCalc.AD_Ref_Any); @@ -224,7 +230,7 @@ public class ReferenceConfidenceModel { gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF))); gb.PL(leastConfidenceGLs.getAsPLs()); - gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads); + //gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads); vcb.genotypes(gb.make()); results.add(vcb.make()); @@ -252,14 +258,21 @@ public class ReferenceConfidenceModel { * @return non-null GenotypeLikelihoods given N */ protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) { - // TODO -- optimization -- this could easily be optimized with some caching - final double homRef = 0.0; - final double het = - LOG10_2 * nInformativeReads; - final double homVar = INDEL_ERROR_RATE * nInformativeReads; - return GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar}); + return indelPLCache[nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? 
MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads]; + } + + protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway + private static final GenotypeLikelihoods[] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1]; + private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp + + private void initializeIndelPLCache() { + for( int nInformativeReads = 0; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) { + final double homRef = 0.0; + final double het = MathUtils.LOG_ONE_HALF * nInformativeReads; + final double homVar = INDEL_ERROR_RATE * nInformativeReads; + indelPLCache[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar}); + } } - private final static double LOG10_2 = Math.log10(2); - private final static double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp /** * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt @@ -274,8 +287,8 @@ public class ReferenceConfidenceModel { final RefVsAnyResult result = new RefVsAnyResult(); for( final PileupElement p : pileup ) { - final byte qual = p.getQual(); - if( p.isDeletion() || qual > minBaseQual) { + final byte qual = (p.isDeletion() ? 
REF_MODEL_DELETION_QUAL : p.getQual()); + if( p.isDeletion() || qual > minBaseQual ) { int AA = 0; final int AB = 1; int BB = 2; if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { AA = 2; @@ -302,20 +315,37 @@ public class ReferenceConfidenceModel { private List getPileupsOverReference(final Haplotype refHaplotype, final Collection calledHaplotypes, final GenomeLoc paddedReferenceLoc, + final ActiveRegion activeRegion, final GenomeLoc activeRegionSpan, final Map stratifiedReadMap) { - final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO"); - final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest); - writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves - writer.writeReadsAlignedToHaplotypes(calledHaplotypes.isEmpty() ? 
Collections.singleton(refHaplotype) : calledHaplotypes, paddedReferenceLoc, stratifiedReadMap); - final List realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads()); + + if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); + if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); + if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); + if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); + if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); + + List realignedReads; + + if( calledHaplotypes.size() == 1 ) { // only contains ref haplotype so an optimization is to just trust the alignments to the reference haplotype as provided by the aligner + realignedReads = activeRegion.getReads(); + } else { + final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO"); + final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest); + writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves + writer.setOnlyRealignInformativeReads(true); + writer.writeReadsAlignedToHaplotypes(calledHaplotypes, paddedReferenceLoc, stratifiedReadMap); + realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads()); + } if ( debuggingWriter != null ) for ( final GATKSAMRecord read : realignedReads ) debuggingWriter.addAlignment(read); final 
LocusIteratorByState libs = new LocusIteratorByState(realignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING, - false, genomeLocParser, samples, false); + true, genomeLocParser, samples, false); final List pileups = new LinkedList<>(); final int startPos = activeRegionSpan.getStart(); @@ -378,7 +408,7 @@ public class ReferenceConfidenceModel { final byte refBase = refBases[refStart + i]; if ( readBase != refBase ) { sum += readQuals[readStart + i]; - if ( sum > maxSum ) + if ( sum > maxSum ) // abort early return sum; } } @@ -403,7 +433,10 @@ public class ReferenceConfidenceModel { final byte[] refBases, final int refStart, final int maxIndelSize) { - // todo -- fast exit when n bases left < maxIndelSize + // fast exit when n bases left < maxIndelSize + if( readBases.length - readStart < maxIndelSize || refBases.length - refStart < maxIndelSize ) { + return false; + } final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE); @@ -445,12 +478,16 @@ public class ReferenceConfidenceModel { final int offset = p.getOffset(); // doesn't count as evidence - if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() ) + if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() || p.isDeletion() ) continue; // todo -- this code really should handle CIGARs directly instead of relying on the above tests - if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize)) + if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize) ) { nInformative++; + if( nInformative > MAX_N_INDEL_INFORMATIVE_READS ) { + return MAX_N_INDEL_INFORMATIVE_READS; + } + } } return nInformative; } diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java index 8f509b36b..8ee3c166c 100644 --- 
a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java @@ -230,6 +230,7 @@ public class GVCFWriter implements VariantContextWriter { gb.DP(block.getMedianDP()); gb.attribute(MIN_DP_FORMAT_FIELD, block.getMinDP()); gb.attribute(MIN_GQ_FORMAT_FIELD, block.getMinGQ()); + gb.PL(block.getMinPLs()); return vcb.genotypes(gb.make()).make(); } diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java index 282e49217..ebd167a31 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java +++ b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java @@ -69,10 +69,11 @@ import java.util.List; */ final class HomRefBlock { private final VariantContext startingVC; - int stop; + private int stop; private final int minGQ, maxGQ; - private List GQs = new ArrayList<>(100); - private List DPs = new ArrayList<>(100); + private int[] minPLs = null; + final private List GQs = new ArrayList<>(100); + final private List DPs = new ArrayList<>(100); private final Allele ref; /** @@ -116,9 +117,23 @@ final class HomRefBlock { public void add(final int pos, final Genotype g) { if ( g == null ) throw new IllegalArgumentException("g cannot be null"); if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field"); + if ( ! g.hasPL() ) throw new IllegalArgumentException("g must have PL field"); if ( ! 
g.hasDP() ) throw new IllegalArgumentException("g must have DP field"); if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop); + if( minPLs == null ) { // if the minPLs vector has not been set yet, create it here by copying the provided genotype's PLs + final int[] PL = g.getPL(); + if( PL.length == 3 ) { + minPLs = PL.clone(); + } + } else { // otherwise take the min with the provided genotype's PLs + final int[] PL = g.getPL(); + if( PL.length == 3 ) { + minPLs[0] = Math.min(minPLs[0], PL[0]); + minPLs[1] = Math.min(minPLs[1], PL[1]); + minPLs[2] = Math.min(minPLs[2], PL[2]); + } + } stop = pos; GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. of 99 emission DPs.add(g.getDP()); @@ -141,6 +156,8 @@ final class HomRefBlock { public int getMinDP() { return MathUtils.arrayMin(DPs); } /** Get the median DP observed within this band */ public int getMedianDP() { return MathUtils.median(DPs); } + /** Get the min PLs observed within this band, can be null if no PLs have yet been observed */ + public int[] getMinPLs() { return minPLs; } protected int getGQUpperBound() { return maxGQ; } protected int getGQLowerBound() { return minGQ; } diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java index c298485f6..1e6d7c13c 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java @@ -46,7 +46,6 @@ package org.broadinstitute.sting.utils.haplotypeBAMWriter; -import net.sf.samtools.SAMFileWriter; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; diff 
--git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java index 509399fd9..df523b74d 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -77,8 +77,9 @@ public abstract class HaplotypeBAMWriter { protected final static String READ_GROUP_ID = "ArtificialHaplotype"; protected final static String HAPLOTYPE_TAG = "HC"; - final ReadDestination output; - boolean writeHaplotypesAsWell = true; + private final ReadDestination output; + private boolean writeHaplotypesAsWell = true; + private boolean onlyRealignInformativeReads = false; /** * Possible modes for writing haplotypes to BAMs @@ -181,9 +182,16 @@ public abstract class HaplotypeBAMWriter { final Haplotype haplotype, final int referenceStart, final boolean isInformative) { - final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative); - if ( alignedToRef != null ) - output.add(alignedToRef); + if( onlyRealignInformativeReads && !isInformative ) { + if( originalRead != null ) { + output.add(originalRead); + } + } else { + final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative); + if ( alignedToRef != null ) { + output.add(alignedToRef); + } + } } /** @@ -305,7 +313,15 @@ public abstract class HaplotypeBAMWriter { return writeHaplotypesAsWell; } - public void setWriteHaplotypesAsWell(boolean writeHaplotypesAsWell) { + public void setWriteHaplotypesAsWell(final boolean writeHaplotypesAsWell) { this.writeHaplotypesAsWell = writeHaplotypesAsWell; } + + public boolean getOnlyRealignInformativeReads() { + return onlyRealignInformativeReads; + } + + public void setOnlyRealignInformativeReads(final boolean 
onlyRealignInformativeReads) { + this.onlyRealignInformativeReads = onlyRealignInformativeReads; + } } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 37dc7adba..9f8b72c1d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -46,14 +46,25 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; import org.testng.annotations.Test; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.Arrays; public class VariantAnnotatorIntegrationTest extends WalkerTest { + final static String REF = b37KGReference; + final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + public static String baseTestString() { return "-T VariantAnnotator -R " + b36KGReference + " --no_cmdline_in_header -o %s"; } @@ -290,4 +301,48 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { executeTest("Testing InbreedingCoeff annotation with PED file", spec); } + @Test + public void testStrandBiasBySample() throws IOException { + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, 
Arrays.asList("")); + final File outputVCF = executeTest("testStrandBiasBySample", spec).getFirst().get(0); + + final String baseNoFS = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA FisherStrand -A StrandBiasBySample"; + final WalkerTestSpec specNoFS = new WalkerTestSpec(baseNoFS, 1, Arrays.asList("")); + specNoFS.disableShadowBCF(); + final File outputVCFNoFS = executeTest("testStrandBiasBySample component stand bias annotation", specNoFS).getFirst().get(0); + + final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoFS.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A FisherStrand"; + final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("")); + specAnn.disableShadowBCF(); + final File outputVCFAnn = executeTest("testStrandBiasBySample re-annotation of FisherStrand", specAnn).getFirst().get(0); + + // confirm that the FisherStrand values are identical for the two pipelines + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + + final VCFCodec codecAnn = new VCFCodec(); + final FileInputStream sAnn = new FileInputStream(outputVCFAnn); + final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn)); + codecAnn.readHeader(lineIteratorAnn); + + while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + + final String lineAnn = lineIteratorAnn.next(); + Assert.assertFalse(lineAnn == null); + final VariantContext vcAnn = codecAnn.decode(lineAnn); + + Assert.assertTrue(vc.hasAttribute("FS")); + Assert.assertTrue(vcAnn.hasAttribute("FS")); + 
Assert.assertEquals(vc.getAttributeAsDouble("FS", 0.0), vcAnn.getAttributeAsDouble("FS", -1.0)); + } + + Assert.assertFalse(lineIterator.hasNext()); + Assert.assertFalse(lineIteratorAnn.hasNext()); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java index 355a47cbc..445864380 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java @@ -129,7 +129,7 @@ public class IndelGenotypeLikelihoodsUnitTest extends BaseTest { } private List getConsensusAlleles(int eventLength, boolean isInsertion, int minCnt, double minFraction, String altBases) { - final ConsensusAlleleCounter counter = new ConsensusAlleleCounter(pileupProvider.genomeLocParser, true, minCnt, minFraction); + final ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, minCnt, minFraction); return counter.computeConsensusAlleles(pileupProvider.referenceContext, pileupProvider.getAlignmentContextFromAlleles(isInsertion?eventLength:-eventLength,altBases,numReadsPerAllele), AlignmentContextUtils.ReadOrientation.COMPLETE); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index e73c04d2c..e430cd8d1 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -57,18 +57,18 @@ import java.util.List; public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { 
@DataProvider(name = "MyDataProvider") public Object[][] makeMyDataProvider() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000"; final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ce9c42e7e97a45a82315523dbd77fcf"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "e32b7fc4de29ed141dcafc0d789d5ed6"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "ecac86e8ef4856e6dfa306c436e9b545"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "c5a55196e10680a02c833a8a44733306"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "9b9923ef41bfc7346c905fdecf918f92"}); tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "7cb1e431119df00ec243a6a115fa74b8"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "7828256b82df377cc3a26a55dbf68f91"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "e41e0acf172a994e938a150390badd39"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "90e22230149e6c32d1115d0e2f03cab1"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b39a4bc19a0acfbade22a011cd229262"}); return tests.toArray(new Object[][]{}); diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 5824e905f..f7056ef58 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -136,16 +136,16 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) { - final List> VCs = new ArrayList>(); + final List> VCs = new ArrayList<>(); try { for( final VariantContext vc : GATKVCFUtils.readVCF(vcf).getSecond() ) { - VCs.add(new Pair(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc))); + VCs.add(new Pair<>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc))); } } catch( IOException e ) { throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read."); } - final Set> VCsAsSet = new HashSet>(VCs); + final Set> VCsAsSet = new HashSet<>(VCs); return VCsAsSet.size() != VCs.size(); // The set will remove duplicate Events. 
} @@ -233,7 +233,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("f3e636d64042e766cc6515987e85a968")); + Arrays.asList("a43d6226a51eb525f0774f88e3778189")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -256,7 +256,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestAggressivePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, - Arrays.asList("ab49f80783e5db5f9ab6b13ba2ad00cb")); + Arrays.asList("19c2992541ede7407192660fdc1fadbf")); executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); } @@ -264,7 +264,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestConservativePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, - Arrays.asList("16f7ffa063511c70bad795639a1c2638")); + Arrays.asList("f4ab037915db3a40ba26e9ee30d40e16")); executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java index 
3402b73f0..21648b2b9 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -58,10 +58,10 @@ import java.util.List; public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { @DataProvider(name = "NCTDataProvider") public Object[][] makeNCTDataProvider() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, "e4bf389676fa090c95980349310ba5ca"}); + tests.add(new Object[]{nct, "29cb04cca87f42b4762c34dfea5d15b7"}); } return tests.toArray(new Object[][]{}); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java index 3a4ed7e59..5c6ae93e5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java @@ -99,7 +99,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { @DataProvider(name = "CalcNIndelInformativeReadsData") public Object[][] makeMyDataProvider() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); { // very basic testing final String ref = "ACGT"; @@ -187,7 +187,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { Assert.assertEquals(prev.getAsPLs(), new int[]{0, 0, 0}); Assert.assertEquals(-10 * prev.getLog10GQ(GenotypeType.HOM_REF), 0.0); - for ( int i = 1; i < 10000; i++ ) { + for ( int i = 1; i <= ReferenceConfidenceModel.MAX_N_INDEL_INFORMATIVE_READS; i++ ) { final GenotypeLikelihoods current = model.getIndelPLs(i); final double prevGQ = -10 * 
prev.getLog10GQ(GenotypeType.HOM_REF); final double currGQ = -10 * current.getLog10GQ(GenotypeType.HOM_REF); @@ -379,7 +379,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { Assert.assertEquals(refModel.getEnd(), loc.getStart() + i); Assert.assertFalse(refModel.hasLog10PError()); Assert.assertEquals(refModel.getAlternateAlleles().size(), 1); - Assert.assertEquals(refModel.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(refModel.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); Assert.assertTrue(refModel.hasGenotype(sample)); final Genotype g = refModel.getGenotype(sample); @@ -388,7 +388,6 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { Assert.assertEquals(g.getDP(), expectedDP); Assert.assertTrue(g.hasGQ()); Assert.assertTrue(g.hasPL()); - Assert.assertTrue(g.hasExtendedAttribute(ReferenceConfidenceModel.INDEL_INFORMATIVE_DEPTH)); } final VariantContext vc = call == null ? refModel : call; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 917cbd542..66bc74caa 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -67,7 +67,11 @@ public class CombineVariantsIntegrationTest extends WalkerTest { // TODO TODO TODO TODO TODO TODO TODO TODO // private static String baseTestString(String args) { - return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + b36KGReference + args; + return baseTestString(args, b36KGReference); + } + + private static String baseTestString(String args, String ref) { + return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + ref + 
args; //return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -U LENIENT_VCF_PROCESSING -R " + b36KGReference + args; } @@ -181,6 +185,19 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } + @Test public void combineSingleSamplePipelineGVCF() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + + " -multipleAllelesMergeType MIX_TYPES" + + " --excludeNonVariants -combineAnnotations -setKey null" + + " -L 20:10,000,000-10,001,000", b37KGReference), + 1, + Arrays.asList("2e15db35359144683f1e58e147362679")); + cvExecuteTest("combineSingleSamplePipelineGVCF", spec, true); + } + @Test public void combineDBSNPDuplicateSites() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java index ffbc3c43f..e353739e5 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java @@ -49,7 +49,6 @@ package org.broadinstitute.sting.utils.gvcf; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import 
org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; @@ -91,7 +90,7 @@ public class GVCFWriterUnitTest extends BaseTest { private List standardPartition = Arrays.asList(1, 10, 20); private Allele REF = Allele.create("N", true); private Allele ALT = Allele.create("A"); - private List ALLELES = Arrays.asList(REF, ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + private List ALLELES = Arrays.asList(REF, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); private final String SAMPLE_NAME = "XXYYZZ"; @BeforeMethod @@ -223,10 +222,10 @@ public class GVCFWriterUnitTest extends BaseTest { Assert.assertEquals(vc.getStart(), start); Assert.assertEquals(vc.getEnd(), stop); if ( nonRef ) { - Assert.assertNotEquals(vc.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + Assert.assertNotEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); } else { Assert.assertEquals(vc.getNAlleles(), 2); - Assert.assertEquals(vc.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); Assert.assertEquals(vc.getAttributeAsInt(GVCFWriter.BLOCK_SIZE_INFO_FIELD, -1), stop - start + 1); Assert.assertEquals(vc.getAttributeAsInt(VCFConstants.END_KEY, -1), stop); Assert.assertTrue(vc.hasGenotypes()); @@ -234,8 +233,9 @@ public class GVCFWriterUnitTest extends BaseTest { Assert.assertEquals(vc.getGenotypes().size(), 1); final Genotype g = vc.getGenotype(SAMPLE_NAME); Assert.assertEquals(g.hasAD(), false); - Assert.assertEquals(g.hasLikelihoods(), false); - Assert.assertEquals(g.hasPL(), false); + Assert.assertEquals(g.hasLikelihoods(), true); + Assert.assertEquals(g.hasPL(), true); + Assert.assertEquals(g.getPL().length == 3, true); Assert.assertEquals(g.hasDP(), true); Assert.assertEquals(g.hasGQ(), true); } @@ -307,7 +307,7 @@ public class GVCFWriterUnitTest extends 
BaseTest { @DataProvider(name = "BandPartitionData") public Object[][] makeBandPartitionData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); tests.add(new Object[]{null, false}); tests.add(new Object[]{Collections.emptyList(), false}); diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java index 239aa93b5..ec4797f3d 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java @@ -85,32 +85,34 @@ public class HomRefBlockUnitTest extends BaseTest { @Test public void testMinMedian() { + //TODO - might be better to make this test use a data provider? final HomRefBlock band = new HomRefBlock(vc, 10, 20); final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); int pos = vc.getStart(); - band.add(pos++, gb.DP(10).GQ(11).make()); + band.add(pos++, gb.DP(10).GQ(11).PL(new int[]{0,11,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); assertValues(band, 10, 10, 11, 11); - band.add(pos++, gb.DP(11).GQ(10).make()); + band.add(pos++, gb.DP(11).GQ(10).PL(new int[]{0,10,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); assertValues(band, 10, 11, 10, 11); - band.add(pos++, gb.DP(12).GQ(12).make()); + band.add(pos++, gb.DP(12).GQ(12).PL(new int[]{0,12,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); assertValues(band, 10, 11, 10, 11); - band.add(pos++, gb.DP(13).GQ(15).make()); + band.add(pos++, gb.DP(13).GQ(15).PL(new int[]{0,15,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); - band.add(pos++, gb.DP(14).GQ(16).make()); + band.add(pos++, gb.DP(14).GQ(16).PL(new int[]{0,16,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); - band.add(pos++, gb.DP(15).GQ(17).make()); + band.add(pos++, gb.DP(15).GQ(17).PL(new int[]{0,17,100}).make()); 
Assert.assertEquals(band.getStop(), pos - 1); - band.add(pos++, gb.DP(16).GQ(18).make()); + band.add(pos++, gb.DP(16).GQ(18).PL(new int[]{0,18,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); assertValues(band, 10, 13, 10, 15); Assert.assertEquals(band.getSize(), pos - vc.getStart()); + Assert.assertTrue(Arrays.equals(band.getMinPLs(), new int[]{0,10,100})); } @Test @@ -118,7 +120,7 @@ public class HomRefBlockUnitTest extends BaseTest { final HomRefBlock band = new HomRefBlock(vc, 10, 20); final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); - band.add(vc.getStart(), gb.DP(1000).GQ(1000).make()); + band.add(vc.getStart(), gb.DP(1000).GQ(1000).PL(new int[]{0,10,100}).make()); assertValues(band, 1000, 1000, 99, 99); } @@ -127,7 +129,7 @@ public class HomRefBlockUnitTest extends BaseTest { final HomRefBlock band = new HomRefBlock(vc, 10, 20); final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); - band.add(vc.getStart() + 10, gb.DP(10).GQ(11).make()); + band.add(vc.getStart() + 10, gb.DP(10).GQ(11).PL(new int[]{0,10,100}).make()); } private void assertValues(final HomRefBlock band, final int minDP, final int medianDP, final int minGQ, final int medianGQ) { @@ -140,7 +142,7 @@ public class HomRefBlockUnitTest extends BaseTest { @DataProvider(name = "ContiguousData") public Object[][] makeContiguousData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); for ( final String chrMod : Arrays.asList("", ".mismatch") ) { for ( final int offset : Arrays.asList(-10, -1, 0, 1, 10) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index b85365366..f8628bb78 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -86,7 +86,7 @@ public final class TraverseActiveRegions 
extends TraversalEngine workQueue = new LinkedList(); + private final LinkedList workQueue = new LinkedList<>(); private TAROrderedReadCache myReads = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 10ba4ca17..f2f808cad 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; @@ -83,6 +84,7 @@ import java.util.*; @Requires(value={}) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @Reference(window=@Window(start=-50,stop=50)) +@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) @By(DataSource.REFERENCE) public class VariantAnnotator extends RodWalker implements AnnotatorCompatible, TreeReducible { @@ -132,21 +134,21 @@ public class VariantAnnotator extends RodWalker implements Ann * See the -list argument to view available annotations. 
*/ @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) - protected List annotationsToUse = new ArrayList(); + protected List annotationsToUse = new ArrayList<>(); /** * Note that this argument has higher priority than the -A or -G arguments, * so annotations will be excluded even if they are explicitly included with the other options. */ @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList(); + protected List annotationsToExclude = new ArrayList<>(); /** * If specified, all available annotations in the group will be applied. See the VariantAnnotator -list argument to view available groups. * Keep in mind that RODRequiringAnnotations are not intended to be used as a group, because they require specific ROD inputs. */ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) - protected List annotationGroupsToUse = new ArrayList(); + protected List annotationGroupsToUse = new ArrayList<>(); /** * This option enables you to add annotations from one VCF to another. 
@@ -193,8 +195,8 @@ public class VariantAnnotator extends RodWalker implements Ann } // get the list of all sample names from the variant VCF input rod, if applicable - List rodName = Arrays.asList(variantCollection.variants.getName()); - Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); + final List rodName = Arrays.asList(variantCollection.variants.getName()); + final Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); if ( USE_ALL_ANNOTATIONS ) engine = new VariantAnnotatorEngine(annotationsToExclude, this, getToolkit()); @@ -204,23 +206,23 @@ public class VariantAnnotator extends RodWalker implements Ann // setup the header fields // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones - Set hInfo = new HashSet(); + final Set hInfo = new HashSet<>(); hInfo.addAll(engine.getVCFAnnotationDescriptions()); - for ( VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) { + for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) { if ( isUniqueHeaderLine(line, hInfo) ) hInfo.add(line); } // for the expressions, pull the info header line from the header of the resource rod - for ( VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) { + for ( final VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) { // special case the ID field if ( expression.fieldName.equals("ID") ) { hInfo.add(new VCFInfoHeaderLine(expression.fullName, 1, VCFHeaderLineType.String, "ID field transferred from external VCF resource")); continue; } VCFInfoHeaderLine targetHeaderLine = null; - for ( VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) { + for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), 
Arrays.asList(expression.binding.getName())) ) { if ( line instanceof VCFInfoHeaderLine ) { - VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line; + final VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line; if ( infoline.getID().equals(expression.fieldName) ) { targetHeaderLine = infoline; break; @@ -285,7 +287,7 @@ public class VariantAnnotator extends RodWalker implements Ann Map stratifiedContexts; if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) { stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup()); - annotatedVCs = new ArrayList(VCs.size()); + annotatedVCs = new ArrayList<>(VCs.size()); for ( VariantContext vc : VCs ) annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 078a36dd9..25e683c2f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -58,15 +58,15 @@ public class VariantAnnotatorEngine { public RodBinding binding; public VAExpression(String fullExpression, List> bindings) { - int indexOfDot = fullExpression.lastIndexOf("."); + final int indexOfDot = fullExpression.lastIndexOf("."); if ( indexOfDot == -1 ) throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format"); fullName = fullExpression; fieldName = fullExpression.substring(indexOfDot+1); - String bindingName = fullExpression.substring(0, indexOfDot); - for ( RodBinding rod : bindings ) { + final String bindingName = fullExpression.substring(0, indexOfDot); + for ( final RodBinding rod : bindings ) { if ( rod.getName().equals(bindingName) ) { binding = rod; break; @@ -96,7 +96,7 @@ public class VariantAnnotatorEngine { // 
select specific expressions to use public void initializeExpressions(Set expressionsToUse) { // set up the expressions - for ( String expression : expressionsToUse ) + for ( final String expression : expressionsToUse ) requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings())); } @@ -113,15 +113,15 @@ public class VariantAnnotatorEngine { if ( annotationsToExclude.size() == 0 ) return; - List tempRequestedInfoAnnotations = new ArrayList(requestedInfoAnnotations.size()); - for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) { + final List tempRequestedInfoAnnotations = new ArrayList<>(requestedInfoAnnotations.size()); + for ( final InfoFieldAnnotation annotation : requestedInfoAnnotations ) { if ( !annotationsToExclude.contains(annotation.getClass().getSimpleName()) ) tempRequestedInfoAnnotations.add(annotation); } requestedInfoAnnotations = tempRequestedInfoAnnotations; - List tempRequestedGenotypeAnnotations = new ArrayList(requestedGenotypeAnnotations.size()); - for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { + final List tempRequestedGenotypeAnnotations = new ArrayList<>(requestedGenotypeAnnotations.size()); + for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { if ( !annotationsToExclude.contains(annotation.getClass().getSimpleName()) ) tempRequestedGenotypeAnnotations.add(annotation); } @@ -143,24 +143,24 @@ public class VariantAnnotatorEngine { variantOverlapAnnotator = new VariantOverlapAnnotator(dbSNPBinding, overlapBindings, engine.getGenomeLocParser()); } - public void invokeAnnotationInitializationMethods( Set headerLines ) { - for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) { + public void invokeAnnotationInitializationMethods( final Set headerLines ) { + for ( final VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) { annotation.initialize(walker, toolkit, headerLines); } - for ( VariantAnnotatorAnnotation annotation : 
requestedGenotypeAnnotations ) { + for ( final VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) { annotation.initialize(walker, toolkit, headerLines); } } public Set getVCFAnnotationDescriptions() { - Set descriptions = new HashSet(); + final Set descriptions = new HashSet<>(); - for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) + for ( final InfoFieldAnnotation annotation : requestedInfoAnnotations ) descriptions.addAll(annotation.getDescriptions()); - for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) + for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) descriptions.addAll(annotation.getDescriptions()); - for ( String db : variantOverlapAnnotator.getOverlapNames() ) { + for ( final String db : variantOverlapAnnotator.getOverlapNames() ) { if ( VCFStandardHeaderLines.getInfoLine(db, false) != null ) descriptions.add(VCFStandardHeaderLines.getInfoLine(db)); else @@ -170,10 +170,10 @@ public class VariantAnnotatorEngine { return descriptions; } - public VariantContext annotateContext(final RefMetaDataTracker tracker, - final ReferenceContext ref, - final Map stratifiedContexts, - VariantContext vc) { + public VariantContext annotateContext(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { return annotateContext(tracker, ref, stratifiedContexts, vc, null); } @@ -182,20 +182,20 @@ public class VariantAnnotatorEngine { final Map stratifiedContexts, final VariantContext vc, final Map perReadAlleleLikelihoodMap) { - Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); + final Map infoAnnotations = new LinkedHashMap<>(vc.getAttributes()); // annotate expressions where available annotateExpressions(tracker, ref.getLocus(), infoAnnotations); // go through all the requested info annotationTypes - for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { - Map annotationsFromCurrentType = 
annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap); + for ( final InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { + final Map annotationsFromCurrentType = annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap); if ( annotationsFromCurrentType != null ) infoAnnotations.putAll(annotationsFromCurrentType); } // generate a new annotated VC - VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); + final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); // annotate genotypes, creating another new VC in the process final VariantContext annotated = builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap)).make(); @@ -210,11 +210,11 @@ public class VariantAnnotatorEngine { final Map infoAnnotations = new LinkedHashMap<>(vc.getAttributes()); // go through all the requested info annotationTypes - for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { + for ( final InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { if ( !(annotationType instanceof ActiveRegionBasedAnnotation) ) continue; - Map annotationsFromCurrentType = annotationType.annotate(perReadAlleleLikelihoodMap, vc); + final Map annotationsFromCurrentType = annotationType.annotate(perReadAlleleLikelihoodMap, vc); if ( annotationsFromCurrentType != null ) { infoAnnotations.putAll(annotationsFromCurrentType); } @@ -244,12 +244,12 @@ public class VariantAnnotatorEngine { } private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map infoAnnotations) { - for ( VAExpression expression : requestedExpressions ) { - Collection VCs = tracker.getValues(expression.binding, loc); + for ( final VAExpression expression : requestedExpressions ) { + final Collection VCs = tracker.getValues(expression.binding, loc); if ( VCs.size() == 0 ) 
continue; - VariantContext vc = VCs.iterator().next(); + final VariantContext vc = VCs.iterator().next(); // special-case the ID field if ( expression.fieldName.equals("ID") ) { if ( vc.hasID() ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 45dbc937d..396d5686b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -164,6 +164,9 @@ public class CombineVariants extends RodWalker implements Tree @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false) public boolean minimalVCF = false; + @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the combining procedure", required=false) + public boolean EXCLUDE_NON_VARIANTS = false; + /** * Set to 'null' if you don't want the set field emitted. */ @@ -171,7 +174,7 @@ public class CombineVariants extends RodWalker implements Tree public String SET_KEY = "set"; /** - * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime.. + * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime. 
*/ @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false) public boolean ASSUME_IDENTICAL_SAMPLES = false; @@ -188,6 +191,9 @@ public class CombineVariants extends RodWalker implements Tree @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) public boolean MERGE_INFO_WITH_MAX_AC = false; + @Argument(fullName="combineAnnotations", shortName="combineAnnotations", doc="If true, combine the annotation values in some straightforward manner assuming the input callsets are i.i.d.", required=false) + public boolean COMBINE_ANNOTATIONS = false; + private List priority = null; /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ @@ -238,7 +244,7 @@ public class CombineVariants extends RodWalker implements Tree throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); if ( PRIORITY_STRING != null){ - priority = new ArrayList(Arrays.asList(PRIORITY_STRING.split(","))); + priority = new ArrayList<>(Arrays.asList(PRIORITY_STRING.split(","))); if ( rodNames.size() != priority.size() ) throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); @@ -252,13 +258,16 @@ public class CombineVariants extends RodWalker implements Tree if ( tracker == null ) // RodWalkers can make funky map calls return 0; - Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); + final Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); // get all of the vcf rods at this locus // 
Need to provide reference bases to simpleMerge starting at current locus Collection vcs = tracker.getValues(variants, context.getLocation()); + Collection potentialRefVCs = tracker.getValues(variants); + potentialRefVCs.removeAll(vcs); if ( sitesOnlyVCF ) { vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs); + potentialRefVCs = VariantContextUtils.sitesOnlyVariantContexts(potentialRefVCs); } if ( ASSUME_IDENTICAL_SAMPLES ) { @@ -270,7 +279,7 @@ public class CombineVariants extends RodWalker implements Tree } int numFilteredRecords = 0; - for (VariantContext vc : vcs) { + for (final VariantContext vc : vcs) { if (vc.filtersWereApplied() && vc.isFiltered()) numFilteredRecords++; } @@ -278,16 +287,16 @@ public class CombineVariants extends RodWalker implements Tree if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN)) return 0; - List mergedVCs = new ArrayList(); + final List mergedVCs = new ArrayList<>(); if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE) { - Map> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs); + final Map> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs); // TODO -- clean this up in a refactoring // merge NO_VARIATION into another type of variant (based on the ordering in VariantContext.Type) if ( VCsByType.containsKey(VariantContext.Type.NO_VARIATION) && VCsByType.size() > 1 ) { final List refs = VCsByType.remove(VariantContext.Type.NO_VARIATION); - for ( VariantContext.Type type : VariantContext.Type.values() ) { + for ( final VariantContext.Type type : VariantContext.Type.values() ) { if ( VCsByType.containsKey(type) ) { VCsByType.get(type).addAll(refs); break; @@ -296,23 +305,27 @@ public class CombineVariants extends RodWalker implements Tree } // iterate over the types so that it's deterministic - for (VariantContext.Type type : VariantContext.Type.values()) { - if (VCsByType.containsKey(type)) - 
mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), - priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + for (final VariantContext.Type type : VariantContext.Type.values()) { + // make sure that it is a variant or in case it is not, that we want to include the sites with no variants + if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) { + if (VCsByType.containsKey(type)) { + mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), potentialRefVCs, + priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS)); + } + } } } else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { - mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, + mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, potentialRefVCs, priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS)); } else { logger.warn("Ignoring all records at site " + ref.getLocus()); } - for ( VariantContext mergedVC : mergedVCs ) { + for ( final VariantContext mergedVC : mergedVCs ) { // only operate at the start of events if ( mergedVC == null ) continue; @@ -320,9 +333,12 @@ public class CombineVariants extends RodWalker implements Tree final VariantContextBuilder builder = new VariantContextBuilder(mergedVC); // re-compute chromosome counts VariantContextUtils.calculateChromosomeCounts(builder, false); + if ( minimalVCF ) GATKVariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); - vcfWriter.add(builder.make()); + final VariantContext vc = builder.make(); + if( !EXCLUDE_NON_VARIANTS || 
vc.isPolymorphicInSamples() ) + vcfWriter.add(builder.make()); } return vcs.isEmpty() ? 0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 3af71eabb..bfae7e94c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -845,17 +845,29 @@ public class MathUtils { } /** - * Compute the median element of the array of integers + * Compute the median element of the list of integers * @param array a list of integers * @return the median element */ - public static int median(final List array) { + public static > T median(final List array) { + /* TODO -- from Valentin + the current implementation is not the usual median when the input is of even length. More concretely it returns the ith element of the list where i = floor(input.size() / 2). + + But actually that is not the "usual" definition of a median, as it is supposed to return the average of the two middle values when the sample length is an even number (i.e. median(1,2,3,4,5,6) == 3.5). [Sources: R and wikipedia] + + My suggestion for a solution is then: + + unify median and medianDoubles to public static T median(Collection) + check on null elements and throw an exception if there are any or perhaps return a null; documented in the javadoc. + relocate, rename and refactor MathUtils.median(X) to Utils.ithElement(X,X.size()/2) + In addition, the current median implementation sorts the whole input list which is O(n log n).
However, finding the ith element (and thus calculating the median) can be done in O(n) + */ if ( array == null ) throw new IllegalArgumentException("Array must be non-null"); final int size = array.size(); if ( size == 0 ) throw new IllegalArgumentException("Array cannot have size 0"); else if ( size == 1 ) return array.get(0); else { - final ArrayList sorted = new ArrayList<>(array); + final ArrayList sorted = new ArrayList<>(array); Collections.sort(sorted); return sorted.get(size / 2); } @@ -1405,7 +1417,7 @@ public class MathUtils { * @return */ public static List log10LinearRange(final int start, final int stop, final double eps) { - final LinkedList values = new LinkedList(); + final LinkedList values = new LinkedList<>(); final double log10range = Math.log10(stop - start); if ( start == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index f4c673e61..8a034dde0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -46,7 +46,7 @@ import java.util.List; * Time: 8:54:05 AM */ public class PileupElement implements Comparable { - private final static LinkedList EMPTY_LINKED_LIST = new LinkedList(); + private final static LinkedList EMPTY_LINKED_LIST = new LinkedList<>(); private final static EnumSet ON_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.D); diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java index 3a8afca8c..4cf39d6be 100644 --- a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java @@ -30,7 +30,7 @@ import net.sf.samtools.Cigar; /** * Generic
interface for SmithWaterman calculations * - * This interface allows clients to use a generic SmithWaterman variable, without propogating the specific + * This interface allows clients to use a generic SmithWaterman variable, without propagating the specific * implementation of SmithWaterman throughout their code: * * SmithWaterman sw = new SpecificSmithWatermanImplementation(ref, read, params) diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 3bc5da82f..e8c438a53 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -45,7 +45,11 @@ public class GATKVariantContextUtils { public static final int DEFAULT_PLOIDY = 2; public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. 
- protected static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + + public final static List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; + public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site + public final static String MERGE_FILTER_PREFIX = "filterIn"; public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; @@ -108,7 +112,7 @@ public class GATKVariantContextUtils { int averageLengthNum = 0; int averageLengthDenom = 0; int refLength = vc.getReference().length(); - for ( Allele a : vc.getAlternateAlleles() ) { + for ( final Allele a : vc.getAlternateAlleles() ) { int numAllele = vc.getCalledChrCount(a); int alleleSize; if ( a.length() == refLength ) { @@ -182,8 +186,8 @@ public class GATKVariantContextUtils { */ public static VariantContext reverseComplement(VariantContext vc) { // create a mapping from original allele to reverse complemented allele - HashMap alleleMap = new HashMap(vc.getAlleles().size()); - for ( Allele originalAllele : vc.getAlleles() ) { + HashMap alleleMap = new HashMap<>(vc.getAlleles().size()); + for ( final Allele originalAllele : vc.getAlleles() ) { Allele newAllele; if ( originalAllele.isNoCall() ) newAllele = originalAllele; @@ -195,8 +199,8 @@ public class GATKVariantContextUtils { // create new Genotype objects GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype genotype : vc.getGenotypes() ) { - List newAlleles = new ArrayList(); - for ( Allele allele : genotype.getAlleles() ) { + List newAlleles = new ArrayList<>(); + for ( final Allele allele : genotype.getAlleles() ) { Allele newAllele = alleleMap.get(allele); if ( newAllele == null ) newAllele = Allele.NO_CALL; @@ -267,7 +271,7 @@ 
public class GATKVariantContextUtils { final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); byte[] repeatUnit = null; - final ArrayList lengths = new ArrayList(); + final ArrayList lengths = new ArrayList<>(); for ( final Allele allele : vc.getAlternateAlleles() ) { Pair result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes()); @@ -317,7 +321,7 @@ public class GATKVariantContextUtils { repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; - return new Pair(repetitionCount, repeatUnit); + return new Pair<>(repetitionCount, repeatUnit); } @@ -528,7 +532,7 @@ public class GATKVariantContextUtils { } else { newLikelihoods = new double[likelihoodIndexesToUse.size()]; int newIndex = 0; - for ( int oldIndex : likelihoodIndexesToUse ) + for ( final int oldIndex : likelihoodIndexesToUse ) newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; // might need to re-normalize @@ -718,6 +722,7 @@ public class GATKVariantContextUtils { * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d. 
* @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection unsortedVCs, @@ -728,9 +733,10 @@ public class GATKVariantContextUtils { final boolean printMessages, final String setKey, final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { + final boolean mergeInfoWithMaxAC, + final boolean combineAnnotations ) { int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); - return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC); + return simpleMerge(unsortedVCs, Collections.emptyList(), priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC, combineAnnotations); } /** @@ -738,11 +744,12 @@ public class GATKVariantContextUtils { * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * the sample name. * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use - * SampleUtils.verifyUniqueSamplesNames to check that before using sempleMerge. + * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. 
* * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ * * @param unsortedVCs collection of unsorted VCs + * @param potentialRefVCs collection of unsorted VCs that overlap this locus which should only be searched for potential reference records * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes @@ -751,9 +758,11 @@ public class GATKVariantContextUtils { * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d. * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection unsortedVCs, + final Collection potentialRefVCs, final List priorityListOfVCs, final int originalNumOfVCs, final FilteredRecordMergeType filteredRecordMergeType, @@ -762,7 +771,8 @@ public class GATKVariantContextUtils { final boolean printMessages, final String setKey, final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { + final boolean mergeInfoWithMaxAC, + final boolean combineAnnotations ) { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; @@ -775,12 +785,16 @@ public class GATKVariantContextUtils { final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); // Make sure all variant contexts are padded with reference base in case of indels if necessary - final List VCs = new ArrayList(); + List VCs = new ArrayList<>(); for (final VariantContext vc : preFilteredVCs) { if ( ! 
filteredAreUncalled || vc.isNotFiltered() ) VCs.add(vc); } + + // cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible + VCs = fillInNonRefSymbolicAlleles(VCs, potentialRefVCs); + if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; @@ -789,17 +803,18 @@ public class GATKVariantContextUtils { final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); - final Set alleles = new LinkedHashSet(); - final Set filters = new HashSet(); - final Map attributes = new LinkedHashMap(); - final Set inconsistentAttributes = new HashSet(); - final Set variantSources = new HashSet(); // contains the set of sources we found in our set of VCs that are variant - final Set rsIDs = new LinkedHashSet(1); // most of the time there's one id + final Set alleles = new LinkedHashSet<>(); + final Set filters = new HashSet<>(); + final Map attributes = new LinkedHashMap<>(); + final Set inconsistentAttributes = new HashSet<>(); + final Set variantSources = new HashSet<>(); // contains the set of sources we found in our set of VCs that are variant + final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id VariantContext longestVC = first; int depth = 0; int maxAC = -1; - final Map attributesWithMaxAC = new LinkedHashMap(); + final Map attributesWithMaxAC = new LinkedHashMap<>(); + final Map> annotationMap = new LinkedHashMap<>(); double log10PError = CommonInfo.NO_LOG10_PERROR; boolean anyVCHadFiltersApplied = false; VariantContext vcWithMaxAC = null; @@ -811,7 +826,6 @@ public class GATKVariantContextUtils { boolean remapped = false; // cycle through and add info from the other VCs, making sure the loc/reference matches - for ( final VariantContext vc : VCs ) { if ( longestVC.getStart() != vc.getStart() ) throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + 
vc.toString()); @@ -846,10 +860,10 @@ public class GATKVariantContextUtils { if ( vc.hasID() ) rsIDs.add(vc.getID()); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); - // lets see if the string contains a , separator + // lets see if the string contains a "," separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { - List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); - for (String alleleCount : alleleCountArray) { + final List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); + for (final String alleleCount : alleleCountArray) { final int ac = Integer.valueOf(alleleCount.trim()); if (ac > maxAC) { maxAC = ac; @@ -866,21 +880,36 @@ public class GATKVariantContextUtils { } for (final Map.Entry p : vc.getAttributes().entrySet()) { - String key = p.getKey(); - // if we don't like the key already, don't go anywhere - if ( ! inconsistentAttributes.contains(key) ) { - final boolean alreadyFound = attributes.containsKey(key); - final Object boundValue = attributes.get(key); - final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + final String key = p.getKey(); + final Object value = p.getValue(); + boolean badAnnotation = false; + if ( combineAnnotations ) { // add the annotation values to a list for combining later + List values = annotationMap.get(key); + if( values == null ) { + values = new ArrayList<>(); + annotationMap.put(key, values); + } + try { + final String stringValue = value.toString(); + values.add(stringValue.contains(".") ? Double.parseDouble(stringValue) : Integer.parseInt(stringValue)); + } catch (NumberFormatException e) { + badAnnotation = true; + } + } + if ( ! 
combineAnnotations || badAnnotation ) { // only output annotations that have the same value in every input VC + // if we don't like the key already, don't go anywhere + if ( ! inconsistentAttributes.contains(key) ) { + final boolean alreadyFound = attributes.containsKey(key); + final Object boundValue = attributes.get(key); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); - if ( alreadyFound && ! boundValue.equals(p.getValue()) && ! boundIsMissingValue ) { - // we found the value but we're inconsistent, put it in the exclude list - //System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, p.getValue()); - inconsistentAttributes.add(key); - attributes.remove(key); - } else if ( ! alreadyFound || boundIsMissingValue ) { // no value - //if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), p.getValue()); - attributes.put(key, p.getValue()); + if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { + // we found the value but we're inconsistent, put it in the exclude list + inconsistentAttributes.add(key); + attributes.remove(key); + } else if ( ! alreadyFound || boundIsMissingValue ) { // no value + attributes.put(key, value); + } } } } @@ -906,6 +935,12 @@ public class GATKVariantContextUtils { // take the VC with the maxAC and pull the attributes into a modifiable map if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); + } else if ( combineAnnotations ) { // when combining annotations use the median value from all input VCs which had annotations provided + for ( final Map.Entry> p : annotationMap.entrySet() ) { + if ( ! 
p.getValue().isEmpty() ) { + attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); + } + } } // if at least one record was unfiltered and we want a union, clear all of the filters @@ -922,7 +957,7 @@ public class GATKVariantContextUtils { else if ( variantSources.isEmpty() ) // everyone was reference setValue = MERGE_REF_IN_ALL; else { - final LinkedHashSet s = new LinkedHashSet(); + final LinkedHashSet s = new LinkedHashSet<>(); for ( final VariantContext vc : VCs ) if ( vc.isVariant() ) s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); @@ -950,7 +985,12 @@ public class GATKVariantContextUtils { if ( anyVCHadFiltersApplied ) { builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters)); } - builder.attributes(new TreeMap(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); + builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); + if( combineAnnotations ) { + // unfortunately some attributes are just too dangerous to try to combine together + builder.rmAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY); + builder.rmAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + } // Trim the padded bases of all alleles if necessary final VariantContext merged = builder.make(); @@ -958,6 +998,68 @@ public class GATKVariantContextUtils { return merged; } + private static final Comparable combineAnnotationValues( final List array ) { + return MathUtils.median(array); // right now we take the median but other options could be explored + } + + /** + * cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible + * @param VCs the list of VCs in which to fill in symbolic alleles + * @param potentialRefVCs the list of VCs which are overlapping the current locus-- need to look for reference blocks and fill in with alternate alleles + * @return the list of VCs to merge in which all the NON_REF_SYMBOLIC_ALLELEs have been replaced with the correct alternate allele + 
*/ + protected static final List fillInNonRefSymbolicAlleles( final List VCs, final Collection potentialRefVCs ) { + if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null"); } + if( potentialRefVCs == null ) { throw new IllegalArgumentException("potentialRefVCs cannot be null"); } + + final List VCsToReturn = new ArrayList<>(VCs.size()); + boolean containsNonRefSymbolicAllele = false; + VariantContext nonRefVC = null; + for( final VariantContext vc : VCs ) { + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { + containsNonRefSymbolicAllele = true; + } else if ( nonRefVC == null ) { + nonRefVC = vc; + } + if( nonRefVC != null && containsNonRefSymbolicAllele == true ) { + break; // break out so that we don't run over the whole list unnecessarily + } + } + for( final VariantContext vc : potentialRefVCs ) { + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { + containsNonRefSymbolicAllele = true; + VCs.add(vc); // add the overlapping non-ref symbolic records to the VCs list in order to be filled in below + } + } + + if( !containsNonRefSymbolicAllele ) { + return VCs; + } + + for( final VariantContext vc : VCs ) { + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { // create a new record based on the current record but instead has the symbolic allele replaced by the alternate allele for this site + if( nonRefVC != null ) { + final GenotypesContext genotypes = GenotypesContext.create(vc.getSampleNames().size()); + int depth = 0; + for( final String sample : vc.getSampleNames() ) { + final Genotype gt = vc.getGenotype(sample); + final ArrayList refAlleles = new ArrayList<>(2); + refAlleles.add(nonRefVC.getReference()); + refAlleles.add(nonRefVC.getReference()); + final int[] pl = ( nonRefVC.isBiallelic() ? gt.getPL() : null ); // PLs only works for biallelic sites for now + depth += ( gt.hasDP() ? 
gt.getDP() : Integer.parseInt((String)gt.getAnyAttribute("MIN_DP")) ); // DP is special-cased in CombineVariants so fill it in here + genotypes.add(new GenotypeBuilder(gt).alleles(refAlleles).PL(pl).make()); + } + VCsToReturn.add(new VariantContextBuilder(nonRefVC).attributes(null).attribute("DP", depth).genotypes(genotypes).make()); + } + } else { + VCsToReturn.add(vc); + } + } + + return VCsToReturn; + } + private static final boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { final Iterator it1 = alleleSet1.iterator(); final Iterator it2 = alleleSet2.iterator(); @@ -989,8 +1091,8 @@ public class GATKVariantContextUtils { static private Allele determineReferenceAllele(List VCs) { Allele ref = null; - for ( VariantContext vc : VCs ) { - Allele myRef = vc.getReference(); + for ( final VariantContext vc : VCs ) { + final Allele myRef = vc.getReference(); if ( ref == null || ref.length() < myRef.length() ) ref = myRef; else if ( ref.length() == myRef.length() && ! 
ref.equals(myRef) ) @@ -1024,13 +1126,13 @@ public class GATKVariantContextUtils { // System.out.printf("myref %s%n", myRef ); // System.out.printf("extrabases %s%n", new String(extraBases)); - Map map = new HashMap(); - for ( Allele a : vc.getAlleles() ) { + Map map = new HashMap<>(); + for ( final Allele a : vc.getAlleles() ) { if ( a.isReference() ) map.put(a, refAllele); else { Allele extended = Allele.extend(a, extraBases); - for ( Allele b : allAlleles ) + for ( final Allele b : allAlleles ) if ( extended.equals(b) ) extended = b; // System.out.printf(" Extending %s => %s%n", a, extended); @@ -1050,23 +1152,23 @@ public class GATKVariantContextUtils { throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) - return new ArrayList(unsortedVCs); + return new ArrayList<>(unsortedVCs); else { - ArrayList sorted = new ArrayList(unsortedVCs); + ArrayList sorted = new ArrayList<>(unsortedVCs); Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); return sorted; } } - private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { + private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples) { //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE - for ( Genotype g : oneVC.getGenotypes() ) { - String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); + for ( final Genotype g : oneVC.getGenotypes() ) { + final String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniquifySamples); if ( ! 
mergedGenotypes.containsSample(name) ) { // only add if the name is new Genotype newG = g; - if ( uniqifySamples || alleleMapping.needsRemapping() ) { + if ( uniquifySamples || alleleMapping.needsRemapping() ) { final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); } @@ -1076,8 +1178,8 @@ public class GATKVariantContextUtils { } } - public static String mergedSampleName(String trackName, String sampleName, boolean uniqify ) { - return uniqify ? sampleName + "." + trackName : sampleName; + public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) { + return uniquify ? sampleName + "." + trackName : sampleName; } /** @@ -1104,8 +1206,8 @@ public class GATKVariantContextUtils { * Trim the alleles in inputVC forward and reverse, as requested * * @param inputVC a non-null input VC whose alleles might need a haircut - * @param trimForward should we trim up the alleles from the foward direction? - * @param trimReverse shold we trim up the alleles from the reverse direction? + * @param trimForward should we trim up the alleles from the forward direction? + * @param trimReverse should we trim up the alleles from the reverse direction? 
* @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles */ @Ensures("result != null") @@ -1140,8 +1242,8 @@ public class GATKVariantContextUtils { if( fwdTrimEnd == -1 && revTrim == 0 ) // nothing to do, so just return inputVC unmodified return inputVC; - final List alleles = new LinkedList(); - final Map originalToTrimmedAlleleMap = new HashMap(); + final List alleles = new LinkedList<>(); + final Map originalToTrimmedAlleleMap = new HashMap<>(); for (final Allele a : inputVC.getAlleles()) { if (a.isSymbolic()) { @@ -1300,7 +1402,7 @@ public class GATKVariantContextUtils { } private final static Map subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) { - Map attributes = new HashMap(keysToPreserve.size()); + Map attributes = new HashMap<>(keysToPreserve.size()); for ( final String key : keysToPreserve ) { if ( igc.hasAttribute(key) ) attributes.put(key, igc.getAttribute(key)); @@ -1343,7 +1445,7 @@ public class GATKVariantContextUtils { if (!vc1.getReference().equals(vc2.getReference())) return false; - for (Allele a :vc1.getAlternateAlleles()) { + for (final Allele a :vc1.getAlternateAlleles()) { if (!vc2.getAlternateAlleles().contains(a)) return false; } @@ -1351,17 +1453,24 @@ public class GATKVariantContextUtils { return true; } - public static Map> separateVariantContextsByType(Collection VCs) { - HashMap> mappedVCs = new HashMap>(); - for ( VariantContext vc : VCs ) { + public static Map> separateVariantContextsByType( final Collection VCs ) { + if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null."); } + + final HashMap> mappedVCs = new HashMap<>(); + for ( final VariantContext vc : VCs ) { + VariantContext.Type vcType = vc.getType(); + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { + if( vc.getAlternateAlleles().size() > 1 ) { throw new IllegalStateException("Reference records should not have more than one alternate allele"); } + vcType = VariantContext.Type.NO_VARIATION; + } // look at 
previous variant contexts of different type. If: // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) // c) neither: do nothing, just add vc to its own list boolean addtoOwnList = true; - for (VariantContext.Type type : VariantContext.Type.values()) { - if (type.equals(vc.getType())) + for (final VariantContext.Type type : VariantContext.Type.values()) { + if (type.equals(vcType)) continue; if (!mappedVCs.containsKey(type)) @@ -1376,9 +1485,9 @@ public class GATKVariantContextUtils { // avoid having empty lists if (vcList.size() == 0) mappedVCs.remove(type); - if ( !mappedVCs.containsKey(vc.getType()) ) - mappedVCs.put(vc.getType(), new ArrayList()); - mappedVCs.get(vc.getType()).add(otherVC); + if ( !mappedVCs.containsKey(vcType) ) + mappedVCs.put(vcType, new ArrayList()); + mappedVCs.get(vcType).add(otherVC); break; } else if (allelesAreSubset(vc,otherVC)) { @@ -1390,9 +1499,9 @@ public class GATKVariantContextUtils { } } if (addtoOwnList) { - if ( !mappedVCs.containsKey(vc.getType()) ) - mappedVCs.put(vc.getType(), new ArrayList()); - mappedVCs.get(vc.getType()).add(vc); + if ( !mappedVCs.containsKey(vcType) ) + mappedVCs.put(vcType, new ArrayList()); + mappedVCs.get(vcType).add(vc); } } @@ -1403,10 +1512,10 @@ public class GATKVariantContextUtils { if ( allowedAttributes == null ) return vc; - GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + final GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype genotype : vc.getGenotypes() ) { - Map attrs = new HashMap(); - for ( Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { + final Map attrs = new HashMap<>(); + for ( final Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { if ( 
allowedAttributes.contains(attr.getKey()) ) attrs.put(attr.getKey(), attr.getValue()); } @@ -1427,8 +1536,8 @@ public class GATKVariantContextUtils { public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } public List remap(List as) { - List newAs = new ArrayList(); - for ( Allele a : as ) { + List newAs = new ArrayList<>(); + for ( final Allele a : as ) { //System.out.printf(" Remapping %s => %s%n", a, remap(a)); newAs.add(remap(a)); } @@ -1467,7 +1576,7 @@ public class GATKVariantContextUtils { if ( alleleStrings == null || alleleStrings.isEmpty() ) throw new IllegalArgumentException("alleleStrings must be non-empty, non-null list"); - final List alleles = new LinkedList(); + final List alleles = new LinkedList<>(); final int length = alleleStrings.get(0).length(); boolean first = true; @@ -1503,7 +1612,7 @@ public class GATKVariantContextUtils { if ( ref.length != alt.length ) throw new IllegalStateException("ref and alt alleles for MNP have different lengths"); - final List result = new ArrayList(ref.length); + final List result = new ArrayList<>(ref.length); for ( int i = 0; i < ref.length; i++ ) { @@ -1518,7 +1627,7 @@ public class GATKVariantContextUtils { final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); // create new genotypes with updated alleles - final Map alleleMap = new HashMap(); + final Map alleleMap = new HashMap<>(); alleleMap.put(vc.getReference(), newRefAllele); alleleMap.put(vc.getAlternateAllele(0), newAltAllele); final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index f2718fb8c..a13797523 100644 --- 
a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -500,7 +500,7 @@ public class MathUtilsUnitTest extends BaseTest { @DataProvider(name = "MedianData") public Object[][] makeMedianData() { - List tests = new ArrayList(); + final List tests = new ArrayList<>(); // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{Arrays.asList(10), 10}); @@ -510,12 +510,16 @@ public class MathUtilsUnitTest extends BaseTest { tests.add(new Object[]{values, 1}); } + for ( final List values : Utils.makePermutations(Arrays.asList(1.1,2.1,-3.1), 3, false) ) { + tests.add(new Object[]{values, 1.1}); + } + return tests.toArray(new Object[][]{}); } @Test(dataProvider = "MedianData") - public void testMedian(final List values, final int expected) { - final int actual = MathUtils.median(values); + public void testMedian(final List values, final Comparable expected) { + final Comparable actual = MathUtils.median(values); Assert.assertEquals(actual, expected, "Failed with " + values); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index 937698d82..220e64f7d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -107,7 +107,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { private MergeAllelesTest(List... 
arg) { super(MergeAllelesTest.class); - LinkedList> all = new LinkedList>(Arrays.asList(arg)); + LinkedList> all = new LinkedList<>(Arrays.asList(arg)); expected = all.pollLast(); inputs = all; } @@ -185,7 +185,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( inputs, priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false, false); Assert.assertEquals(merged.getAlleles(), cfg.expected); } @@ -243,7 +243,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( inputs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); + GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false, false); Assert.assertEquals(merged.getID(), cfg.expected); } @@ -358,7 +358,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { public void testMergeFiltered(MergeFilteredTest cfg) { final List priority = vcs2priority(cfg.inputs); final VariantContext merged = GATKVariantContextUtils.simpleMerge( - cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); @@ -485,7 +485,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { public void testMergeGenotypes(MergeGenotypesTest cfg) { final VariantContext merged = GATKVariantContextUtils.simpleMerge( cfg.inputs, cfg.priority, 
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); @@ -526,10 +526,10 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); + GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false, false); // test genotypes - Assert.assertEquals(merged.getSampleNames(), new HashSet(Arrays.asList("s1.1", "s1.2"))); + Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2"))); } // TODO: remove after testing @@ -540,7 +540,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { // // final VariantContext merged = VariantContextUtils.simpleMerge( // Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, -// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false); +// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false, false); // } // -------------------------------------------------------------------------------- @@ -559,7 +559,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, 
set, false, false, false); if ( annotate ) Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION); @@ -570,7 +570,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } private static final List vcs2priority(final Collection vcs) { - final List priority = new ArrayList(); + final List priority = new ArrayList<>(); for ( final VariantContext vc : vcs ) { priority.add(vc.getSource()); @@ -997,7 +997,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { @DataProvider(name = "PrimitiveAlleleSplittingData") public Object[][] makePrimitiveAlleleSplittingData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); // no split tests.add(new Object[]{"A", "C", 0, null}); @@ -1039,6 +1039,26 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } } + @Test(enabled = !DEBUG) + public void testFillInNonRefSymbolicAlleles() { + final int start = 10; + final String ref = "A"; + final String alt = "C"; + final VariantContext vcAlt = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); + final VariantContext vcRef = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, "<"+GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME+">")); + + List VCs = Arrays.asList(vcAlt, vcRef); + VCs = GATKVariantContextUtils.fillInNonRefSymbolicAlleles(VCs, Collections.emptyList()); + + // make sure the non ref symbolic alleles have all been filled in with the appropriate alternate allele + for( final VariantContext vc : VCs ) { + Assert.assertTrue(vc.getAlternateAlleles().size() == 1); + Assert.assertTrue(vc.getAlternateAllele(0).isNonReference()); + Assert.assertTrue(!vc.getReference().isSymbolic()); + Assert.assertTrue(!vc.getAlternateAllele(0).isSymbolic()); + } + } + // -------------------------------------------------------------------------------- // // test allele remapping @@ -1047,7 +1067,7 @@ public class GATKVariantContextUtilsUnitTest 
extends BaseTest { @DataProvider(name = "AlleleRemappingData") public Object[][] makeAlleleRemappingData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); final Allele originalBase1 = Allele.create((byte)'A'); final Allele originalBase2 = Allele.create((byte)'T'); @@ -1055,7 +1075,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { for ( final byte base1 : BaseUtils.BASES ) { for ( final byte base2 : BaseUtils.BASES ) { for ( final int numGenotypes : Arrays.asList(0, 1, 2, 5) ) { - Map map = new HashMap(2); + Map map = new HashMap<>(2); map.put(originalBase1, Allele.create(base1)); map.put(originalBase2, Allele.create(base2)); diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java index 381b282e0..a1b75a3f1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java @@ -147,7 +147,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { Set samples; public void run(final VariantContext vc) { if ( samples == null ) - samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + samples = new HashSet<>(new ArrayList<>(vc.getSampleNames()).subList(0, nSamplesToTake)); VariantContext sub = vc.subContextFromSamples(samples); sub.getNSamples(); } @@ -176,7 +176,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { Set samples; public void run(final VariantContext vc) { if ( samples == null ) - samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + samples = new HashSet<>(new ArrayList<>(vc.getSampleNames()).subList(0, nSamplesToTake)); vc.getGenotypes(samples).size(); } }; @@ -221,7 +221,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { case MERGE: return new FunctionToBenchmark() 
{ public void run(final VariantContext vc) { - List toMerge = new ArrayList(); + List toMerge = new ArrayList<>(); for ( int i = 0; i < dupsToMerge; i++ ) { GenotypesContext gc = GenotypesContext.create(vc.getNSamples()); @@ -234,7 +234,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { GATKVariantContextUtils.simpleMerge(toMerge, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, - true, false, "set", false, true); + true, false, "set", false, true, false); } }; @@ -363,7 +363,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { // toMerge, null, // org.broadinstitute.variant.variantcontext.v13.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, // org.broadinstitute.variant.variantcontext.v13.VariantContextUtils.GenotypeMergeType.UNSORTED, -// true, false, "set", false, true); +// true, false, "set", false, true, false); // } // }; // From 8b829255e7526ed27ec8c2318288ae9789ab8c92 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Mon, 9 Sep 2013 19:40:03 -0400 Subject: [PATCH 10/77] Clarified docs on using clipping options --- .../gatk/walkers/readutils/ClipReads.java | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java index dfc36954b..78029eb85 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java @@ -57,36 +57,34 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * This tool provides simple, powerful read clipping capabilities to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences. 
+ * Read clipping based on quality, position or sequence matching * + *

This tool provides simple, powerful read clipping capabilities that allow you to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences.

* - *

- * It allows the user to clip bases in reads with poor quality scores, that match particular - * sequences, or that were generated by particular machine cycles. + *

There are three options for clipping (quality, position and sequence), which can be used alone or in combination. In addition, you can specify a clipping representation, which determines exactly how ClipReads applies clips to the reads (soft clips, writing Q0 base quality scores, etc.). Please note that you MUST specify at least one of the three clipping options, and specifying a clipping representation is not sufficient. If you do not specify a clipping option, the program will run but it will not do anything to your reads.

* *
*
Quality score based clipping
*
* Clip bases from the read in clipper from - *
argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)
- * to the end of the read. This is blatantly stolen from BWA. + *
argmax_x{ \sum_{i = x + 1}^l (qTrimmingThreshold - qual) }
+ * to the end of the read. This is copied from BWA. * * Walk through the read from the end (in machine cycle order) to the beginning, calculating the * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the * clipping index in the read (from the end). - *
+ *
*
Cycle based clipping
*
Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc. * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions). * For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12. - *
+ *
*
Sequence matching
*
Clips bases from that exactly match one of a number of base sequences. This employs an exact match algorithm, * filtering only bases whose sequence exactly matches SEQ.
*
* - *

* *

Input

*

@@ -99,7 +97,7 @@ import java.util.regex.Pattern; * operation applied to each read. *

*

- *

Summary output

+ *

Summary output (console)

*
  *     Number of examined reads              13
  *     Number of clipped reads               13
@@ -113,16 +111,29 @@ import java.util.regex.Pattern;
  *     
*

* - *

- *

Example clipping

- * Suppose we are given this read: + *

Example

+ *
+ *   java -jar GenomeAnalysisTK.jar \
+ *     -T ClipReads \
+ *     -R reference.fasta \
+ *     -I original.bam \
+ *     -o clipped.bam \
+ *     -XF seqsToClip.fasta \
+ *     -X CCCCC \
+ *     -CT "1-5,11-15" \
+ *     -QT 10
+ * 
+ *

The command line shown above will apply all three options in combination. See the detailed examples below for how the choice of clipping representation affects the output.

+ * + *

Detailed clipping examples

+ *

Suppose we are given this read:

*
  *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
  *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
  *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
  *     
* - * If we are clipping reads with -QT 10 and -CR WRITE_NS, we get: + *

If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:

* *
  *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
@@ -130,26 +141,20 @@ import java.util.regex.Pattern;
  *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
  *     
* - * Whereas with -CR WRITE_Q0S: + *

Whereas with -QT 10 -CR WRITE_Q0S:

*
  *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
  *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
  *          !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
  *     
* - * Or -CR SOFTCLIP_BASES: + *

Or -QT 10 -CR SOFTCLIP_BASES:

*
  *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3133    29      17S59M  *       *       *
  *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
  *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
  *     
- *

* - *

Examples

- *
- *     -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
- *     -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
- * 
* @author Mark DePristo * @since 2010 @@ -158,10 +163,9 @@ import java.util.regex.Pattern; @Requires({DataSource.READS}) public class ClipReads extends ReadWalker { /** - * If provided, ClipReads will write summary statistics about the clipping operations applied - * to the reads to this file. + * If provided, ClipReads will write summary statistics about the clipping operations applied to the reads in this file. */ - @Output(fullName = "outputStatistics", shortName = "os", doc = "Write output statistics to this file", required = false, defaultToStdout = false) + @Output(fullName = "outputStatistics", shortName = "os", doc = "File to output statistics", required = false, defaultToStdout = false) PrintStream out = null; /** From 2f5064dd1d1e1d8e5b8071bfca3c29b4cc174df1 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Mon, 12 Aug 2013 14:36:57 -0400 Subject: [PATCH 11/77] Provide close methods to clean up resources used while creating AlignmentContexts from BAM file regions. Allows utilization of CoveredLocusView via the API Signed-off-by: David Roazen --- .../sting/gatk/datasources/reads/BAMScheduler.java | 8 ++++++++ .../sting/gatk/datasources/reads/IntervalSharder.java | 3 +++ .../sting/gatk/datasources/reads/SAMDataSource.java | 8 ++++++++ .../sting/gatk/datasources/reads/ShardBalancer.java | 3 +++ 4 files changed, 22 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index adb668ff9..2f03edb68 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -246,6 +246,14 @@ public class BAMScheduler implements Iterator { */ private PeekableIterator bamScheduleIterator = null; + /** + * Clean up underlying BAMSchedule file handles. 
+ */ + public void close() { + if(bamScheduleIterator != null) + bamScheduleIterator.close(); + } + /** * Get the next overlapping tree of bins associated with the given BAM file. * @param currentLocus The actual locus for which to check overlap. diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index 048ce17f5..b476945ce 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -62,6 +62,9 @@ public class IntervalSharder implements Iterator { wrappedIterator = new PeekableIterator(scheduler); this.parser = parser; } + public void close() { + wrappedIterator.close(); + } public boolean hasNext() { return wrappedIterator.hasNext(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index ac2ed4a4c..9dc9734a5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -352,6 +352,14 @@ public class SAMDataSource { resourcePool.releaseReaders(readers); } + public void close() { + SAMReaders readers = resourcePool.getAvailableReaders(); + for(SAMReaderID readerID: readerIDs) { + SAMFileReader reader = readers.getReader(readerID); + reader.close(); + } + } + /** * Returns Reads data structure containing information about the reads data sources placed in this pool as well as * information about how they are downsampled, sorted, and filtered diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java index ff0fa1127..37f1bcfac 100644 
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java @@ -43,4 +43,7 @@ public abstract class ShardBalancer implements Iterable { this.filePointers = new PeekableIterator(filePointers); this.parser = parser; } + public void close() { + this.filePointers.close(); + } } From 74639463b948c660f232fd53cf6728881ad717b1 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 19 Sep 2013 23:07:20 -0400 Subject: [PATCH 12/77] Updating excessive coverage default parameter most people don't care about excessive coverage (unless you're very particular about your analysis). Therefore the best possible default value for this is Integer.maxValue so it doesn't get in the way. Itemized Changes: * change maximumCoverage threshold to Integer.maxValue [delivers #57353620] --- .../gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java index a6cbc1da3..c330b3f02 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java @@ -77,7 +77,7 @@ final class ThresHolder { * If at any locus, a sample has more coverage than this, it will be reported as EXCESSIVE_COVERAGE */ @Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false) - public int maximumCoverage = 700; + public int maximumCoverage = Integer.MAX_VALUE; /** * If any sample has a paired read whose distance between alignment starts (between the pairs) is greater than this, it will be 
reported as BAD_MATE From 5e2ffc74fccf6c6485ddaab1bdb7c9a6007cccb7 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 20 Sep 2013 16:47:12 -0400 Subject: [PATCH 13/77] Automated interpretation for QualifyMissingIntervals * add a new column to do what I have been doing manually for every project, understand why we got no usable coverage in that interval * add unit tests -- this tool is now public, we need tests. * slightly better docs -- in an effort to produce better docs for this tool --- .../walkers/diagnostics/missing/Metrics.java | 35 +++---- .../missing/QualifyMissingIntervals.java | 94 +++++++++++++++++- .../QualifyMissingIntervalsUnitTest.java | 95 +++++++++++++++++++ 3 files changed, 199 insertions(+), 25 deletions(-) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java index 9296cc89b..63c35fd65 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java @@ -47,29 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.missing; /** - * Short one line description of the walker. - *

- *

- * [Long description of the walker] - *

- *

- *

- *

Input

- *

- * [Description of the Input] - *

- *

- *

Output

- *

- * [Description of the Output] - *

- *

- *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T [walker name]
- *  
+ * Metrics class for the QualifyMissingInterval walker * * @author Mauricio Carneiro * @since 5/1/13 @@ -81,6 +59,8 @@ final class Metrics { private int reads; private int refs; + public Metrics() {} + void reads(int reads) {this.reads = reads;} void refs(int refs) {this.refs = refs;} @@ -108,4 +88,13 @@ final class Metrics { return this; } + + // Test related constructor and methods + protected Metrics(double gccontent, double baseQual, double mapQual, int reads, int refs) { + this.gccontent = gccontent; + this.baseQual = baseQual; + this.mapQual = mapQual; + this.reads = reads; + this.refs = refs; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java index eabcf20c1..8cc7bb8f3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java @@ -117,6 +117,56 @@ public final class QualifyMissingIntervals extends LocusWalker @Argument(shortName = "cds", required = false) public String cdsFile = null; + /** + * This value will be used to determine whether or not an interval had too high or too low GC content to be + * sequenced. This is only applied if there was not enough data in the interval. + */ + @Argument(doc = "upper and lower bound for an interval to be considered high/low GC content", + shortName = "gc", required = false) + public double gcThreshold = 0.3; + + /** + * The coverage of a missing interval may determine whether or not an interval is sequenceable. A low coverage will + * trigger gc content, mapping, base qualities and other checks to figure out why this interval was deemed + * unsequenceable. 
+ */ + @Argument(doc = "minimum coverage to be considered sequenceable", + shortName = "cov", required = false) + public int coverageThreshold = 20; + + /** + * An average mapping quality above this value will determine the interval to be mappable. + */ + @Argument(doc = "minimum mapping quality for it to be considered usable", + shortName = "mmq", required = false) + public byte mappingThreshold = 20; + + /** + * An average base quality above this value will rule out the possibility of context specific problems with the + * sequencer. + */ + @Argument(doc = "minimum base quality for it to be considered usable", + shortName = "mbq", required = false) + public byte qualThreshold = 20; + + /** + * Intervals that are too small generate biased analysis. For example an interval of size 1 will have GC content + * 1 or 0. To avoid misinterpreting small intervals, all intervals below this threshold will be ignored in the + * interpretation. + */ + @Argument(doc = "minimum interval length to be considered", + shortName = "size", required = false) + public byte intervalSizeThreshold = 10; + + enum Interpretation { + UNKNOWN, + UNMAPPABLE, + UNSEQUENCEABLE, + GCCONTENT, + NO_DATA, + SMALL_INTERVAL + } + GATKReport simpleReport; GenomeLocSortedSet target; GenomeLocSortedSet cds; @@ -130,7 +180,7 @@ public final class QualifyMissingIntervals extends LocusWalker if (cdsFile == null) cdsFile = targetsFile; - simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "IN", "GC", "BQ", "MQ", "DP", "TP", "CD", "LN"); + simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "IN", "GC", "BQ", "MQ", "DP", "TP", "CD", "LN", "DS"); final GenomeLocParser parser = getToolkit().getGenomeLocParser(); target = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, targetsFile)); cds = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, cdsFile)); @@ -184,7 +234,8 @@ public final class QualifyMissingIntervals extends LocusWalker 
metrics.depth(), getPositionInTarget(interval), cds.overlaps(interval), - interval.size() + interval.size(), + interpret(metrics, interval) ); } simpleReport.print(out); @@ -199,4 +250,43 @@ public final class QualifyMissingIntervals extends LocusWalker } return result; } + + String interpret(final Metrics metrics, final GenomeLoc interval) { + if (interval.size() < intervalSizeThreshold) { + return Interpretation.SMALL_INTERVAL.toString(); + } + else if (metrics.depth() == 0.0) { + return Interpretation.NO_DATA.toString(); + } + return trim(checkMappability(metrics) + checkGCContent(metrics) + checkContext(metrics)); + } + + String checkMappability(Metrics metrics) { + return metrics.depth() >= coverageThreshold && metrics.mapQual() < mappingThreshold ? + Interpretation.UNMAPPABLE + ", " : ""; + } + + String checkGCContent(Metrics metrics) { + return metrics.depth() < coverageThreshold && (metrics.gccontent() < gcThreshold || metrics.gccontent() > 1.0-gcThreshold) ? + Interpretation.GCCONTENT + ", " : ""; + } + + String checkContext(Metrics metrics) { + return metrics.depth() < coverageThreshold && metrics.baseQual() < qualThreshold ? 
+ Interpretation.UNSEQUENCEABLE + ", " : ""; + } + + String trim (String s) { + if (s.isEmpty()) + return Interpretation.UNKNOWN.toString(); + + s = s.trim(); + if (s.endsWith(",")) + s = s.substring(0, s.length() - 1); + return s; + } + + + + } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java new file mode 100644 index 000000000..7d6d05736 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java @@ -0,0 +1,95 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.missing; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * Created with IntelliJ IDEA. + * User: carneiro + * Date: 9/20/13 + * Time: 3:59 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class QualifyMissingIntervalsUnitTest extends BaseTest { + @Test(enabled = true) + public void testInterpretation() { + final QualifyMissingIntervals tool = new QualifyMissingIntervals(); + + final Metrics unmappable = new Metrics(0.5, 7500.0, 0.0, 2500, 20); + final Metrics highGC = new Metrics(0.99, 0.0, 0.0, 0, 20); + final Metrics lowGC = new Metrics(0.09, 0.0, 0.0, 0, 20); + final Metrics unsequenceable = new Metrics(0.5, 3.0, 1200.0, 10, 20); + final Metrics noData = new Metrics(0.5, 0.0, 0.0, 0, 20); + final Metrics unknown = new Metrics(0.5, 30.0, 120000.0, 2500, 20); + + final Metrics[] array = {unmappable, highGC, lowGC, unsequenceable, noData, unknown}; + + final GenomeLoc testInterval = new UnvalidatingGenomeLoc("chr1", 0, 10000, 20000); + final GenomeLoc smallInterval = new UnvalidatingGenomeLoc("chr1", 0, 1, 4); + + + Assert.assertNotEquals(tool.checkMappability(unmappable), ""); + Assert.assertNotEquals(tool.checkGCContent(highGC), ""); + Assert.assertNotEquals(tool.checkGCContent(lowGC), ""); + Assert.assertNotEquals(tool.checkContext(unsequenceable), ""); + + Assert.assertEquals(tool.interpret(unmappable, testInterval), QualifyMissingIntervals.Interpretation.UNMAPPABLE.toString()); + Assert.assertEquals(tool.interpret(noData, testInterval), QualifyMissingIntervals.Interpretation.NO_DATA.toString()); + Assert.assertEquals(tool.interpret(noData, testInterval), QualifyMissingIntervals.Interpretation.NO_DATA.toString()); + Assert.assertEquals(tool.interpret(noData, testInterval), QualifyMissingIntervals.Interpretation.NO_DATA.toString()); + Assert.assertEquals(tool.interpret(noData, testInterval), QualifyMissingIntervals.Interpretation.NO_DATA.toString()); + Assert.assertEquals(tool.interpret(unknown, testInterval), QualifyMissingIntervals.Interpretation.UNKNOWN.toString()); + + for (Metrics m : array) + Assert.assertEquals(tool.interpret(m, smallInterval), QualifyMissingIntervals.Interpretation.SMALL_INTERVAL.toString()); + } +} From 
5bbad75402424c80789aef5496ea5004c5d26c93 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 20 Sep 2013 18:54:01 -0400 Subject: [PATCH 14/77] Changing max coverage threshold Because Integer.maxValue is not unit testable --- .../gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java index c330b3f02..b088951e5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java @@ -77,7 +77,7 @@ final class ThresHolder { * If at any locus, a sample has more coverage than this, it will be reported as EXCESSIVE_COVERAGE */ @Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false) - public int maximumCoverage = Integer.MAX_VALUE; + public int maximumCoverage = Integer.MAX_VALUE / 2; /** * If any sample has a paired read whose distance between alignment starts (between the pairs) is greater than this, it will be reported as BAD_MATE From b32ad99d3f63d1341ff7429f9bb203367e8937fd Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Tue, 27 Aug 2013 15:56:30 -0400 Subject: [PATCH 15/77] Changing from scala 2.9.2 to 2.10.2. --modified ivy dependencies --modified scala classpath in build.xml to include scala-reflect --changed imports to point to the new scala scala.reflect.internal.util --set the bootclasspath in QScriptManager as well as the classpath variable. 
--removing Set[File] <-> Set[String] conversions ----Set is invariant now and the conversions broke --removing unit tests for Set[File] <-> Set[String] conversions --- build.xml | 1 + ivy.xml | 4 +- .../sting/queue/QScriptManager.scala | 8 +- .../queue/util/StringFileConversions.scala | 22 ------ .../util/StringFileConversionsUnitTest.scala | 75 ------------------- 5 files changed, 8 insertions(+), 102 deletions(-) diff --git a/build.xml b/build.xml index 0844717dd..47472d33e 100644 --- a/build.xml +++ b/build.xml @@ -526,6 +526,7 @@ + diff --git a/ivy.xml b/ivy.xml index ed13af1c2..2e45247ab 100644 --- a/ivy.xml +++ b/ivy.xml @@ -82,8 +82,8 @@ - - + + diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala b/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala index 37c4a5bbe..c6b8eff13 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala @@ -33,10 +33,9 @@ import java.io.File import scala.tools.nsc.reporters.AbstractReporter import java.lang.String import org.apache.log4j.Level -import scala.tools.nsc.util.{FakePos, NoPosition, Position} import org.broadinstitute.sting.queue.util.TextFormatUtils._ import org.broadinstitute.sting.utils.classloader.JVMUtils -import tools.util.StringOps +import scala.reflect.internal.util.{FakePos, NoPosition, Position, StringOps} /** * Plugin manager for QScripts which loads QScripts into the current class loader. @@ -53,7 +52,10 @@ class QScriptManager() extends Logging { settings.outdir.value = tempDir.getPath // Set the classpath to the current class path. 
- JVMUtils.getClasspathURLs.foreach(url => settings.classpath.append(url.getPath)) + JVMUtils.getClasspathURLs.foreach(url => { + settings.bootclasspath.append(url.getPath) + settings.classpath.append(url.getPath) + }) val reporter = new QScriptManager.Log4JReporter(settings) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala b/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala index 35f872848..8e8a87abe 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala @@ -77,21 +77,6 @@ object StringFileConversions { }) } - implicit def stringsAsFiles(x: Set[Comparable[_ >: File with String <: Comparable[_ >: File with String <: Comparable[_ >: File with String <: Serializable] with Serializable] with Serializable] with Serializable]): Set[File] = { - x.map(_ match { - case string: String => stringAsFile(string) - case file: File => file - case null => null - }) - } - - implicit def filesAsStrings(x: Set[Comparable[_ >: String with File <: Comparable[_ >: String with File <: Comparable[_ >: String with File <: Serializable] with Serializable] with Serializable] with Serializable]): Set[String] = { - x.map(_ match { - case file: File => fileAsString(file) - case string: String => string - case null => null - }) - } } /** @@ -124,11 +109,4 @@ trait StringFileConversions { StringFileConversions.filesAsStringsList(x) } - implicit def stringsAsFiles(x: Set[Comparable[_ >: File with String <: Comparable[_ >: File with String <: Comparable[_ >: File with String <: Serializable] with Serializable] with Serializable] with Serializable]): Set[File] = { - StringFileConversions.stringsAsFiles(x) - } - - implicit def filesAsStrings(x: Set[Comparable[_ >: String with File <: Comparable[_ >: String with File <: Comparable[_ >: String with File <: Serializable] with Serializable] with 
Serializable] with Serializable]): Set[String] = { - StringFileConversions.filesAsStrings(x) - } } diff --git a/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala index 22a7a8a04..5ee02b8bc 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala @@ -87,44 +87,6 @@ class StringFileConversionsUnitTest { Assert.assertEquals(strings, Seq(null, null)) } - @Test - def testStringToFileSet() { - var files = Set(new File("foo")) - files += "bar" - Assert.assertEquals(files, Set(new File("foo"), new File("bar"))) - - files = Set(new File("foo")) - files += null.asInstanceOf[String] - Assert.assertEquals(files, Set(new File("foo"), null)) - - files = Set[File](null) - files += "foo" - Assert.assertEquals(files, Set(new File("foo"), null)) - - files = Set[File](null) - files += null.asInstanceOf[String] - Assert.assertEquals(files, Set(null)) - } - - @Test - def testFileToStringSet() { - var strings = Set("foo") - strings += new File("bar") - Assert.assertEquals(strings, Set("foo", "bar")) - - strings = Set("foo") - strings += null.asInstanceOf[File] - Assert.assertEquals(strings, Set("foo", null)) - - strings = Set[String](null) - strings += new File("foo") - Assert.assertEquals(strings, Set("foo", null)) - - strings = Set[String](null) - strings += null.asInstanceOf[File] - Assert.assertEquals(strings, Set(null)) - } - @Test def testStringListToFileList() { var files = Seq(new File("foo")) @@ -163,41 +125,4 @@ class StringFileConversionsUnitTest { Assert.assertEquals(strings, Seq(null, null)) } - @Test - def testStringSetToFileSet() { - var files = Set(new File("foo")) - files ++= Set("bar") - Assert.assertEquals(files, Set(new File("foo"), new File("bar"))) - - files = Set(new File("foo")) - files ++= 
Set[String](null) - Assert.assertEquals(files, Set(new File("foo"), null)) - - files = Set[File](null) - files ++= Set("foo") - Assert.assertEquals(files, Set(new File("foo"), null)) - - files = Set[File](null) - files ++= Set[String](null) - Assert.assertEquals(files, Set(null)) - } - - @Test - def testFileSetToStringSet() { - var strings = Set("foo") - strings ++= Set(new File("bar")) - Assert.assertEquals(strings, Set("foo", "bar")) - - strings = Set("foo") - strings ++= Set[File](null) - Assert.assertEquals(strings, Set("foo", null)) - - strings = Set[String](null) - strings ++= Set(new File("foo")) - Assert.assertEquals(strings, Set("foo", null)) - - strings = Set[String](null) - strings ++= Set[File](null) - Assert.assertEquals(strings, Set(null)) - } } From c05208ecec01bdc969d2b2a0bb547dad9999b77e Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Fri, 30 Aug 2013 16:43:21 -0400 Subject: [PATCH 16/77] Resolving warnings --specifying exception types in cases where none was already specified ----mostly changed to catch Exception instead of Throwable ----EmailMessage has a point where it should only be expecting a RetryException but was catching everything --changing build.xml so that it prints scala feature warning details --added necessary imports needed to remove feature warnings --updating a newly deprecated enum declaration to match the new syntax --- build.xml | 4 ++-- .../sting/queue/engine/drmaa/DrmaaJobRunner.scala | 2 +- .../sting/queue/engine/lsf/Lsf706JobRunner.scala | 2 +- .../sting/queue/engine/shell/ShellJobRunner.scala | 4 ++-- .../org/broadinstitute/sting/queue/function/QFunction.scala | 1 + .../sting/queue/library/ipf/vcf/VCFExtractSamples.scala | 2 +- .../org/broadinstitute/sting/queue/util/EmailMessage.scala | 4 ++-- .../sting/queue/util/PrimitiveOptionConversions.scala | 2 ++ .../scala/src/org/broadinstitute/sting/queue/util/Retry.scala | 2 +- .../sting/queue/util/StringFileConversions.scala | 1 + 
.../broadinstitute/sting/queue/pipeline/PipelineTest.scala | 4 ++-- 11 files changed, 16 insertions(+), 12 deletions(-) diff --git a/build.xml b/build.xml index 47472d33e..16cfa9a61 100644 --- a/build.xml +++ b/build.xml @@ -538,7 +538,7 @@ Building Scala... - + @@ -1219,7 +1219,7 @@ - + diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala index 9cfd69247..79fc8589f 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala @@ -160,7 +160,7 @@ class DrmaaJobRunner(val session: Session, val function: CommandLineFunction) ex // resource of the designated queue to SIGTERM session.control(jobId, Session.TERMINATE) } catch { - case e => + case e: Exception => logger.error("Unable to kill job " + jobId, e) } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index 1140c4945..ead29bbf5 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -361,7 +361,7 @@ object Lsf706JobRunner extends Logging { if (LibBat.lsb_signaljob(runner.jobId, SIGTERM) < 0) logger.error(LibBat.lsb_sperror("Unable to kill job " + runner.jobId)) } catch { - case e => + case e: Exception=> logger.error("Unable to kill job " + runner.jobId, e) } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala index 13b3c7cb3..e3528f54f 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala +++ 
b/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala @@ -83,8 +83,8 @@ class ShellJobRunner(val function: CommandLineFunction) extends CommandLineJobRu try { controller.tryDestroy() } catch { - case e => - logger.error("Unable to kill shell job: " + function.description) + case e: Exception => + logger.error("Unable to kill shell job: " + function.description, e) } } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala index 81c76dd29..abbb63271 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala @@ -32,6 +32,7 @@ import org.broadinstitute.sting.queue.{QException, QSettings} import java.lang.IllegalStateException import org.broadinstitute.sting.queue.util._ import org.broadinstitute.sting.utils.io.IOUtils +import scala.language.reflectiveCalls /** * The base interface for all functions in Queue. 
diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala index 6dcc69854..ddff95f21 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala @@ -39,7 +39,7 @@ class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extend @Argument(doc="The samples to extract from the VCF") var extractSamples : List[String] = samples var out : PrintWriter = _ - var columns : List[Int] = 0 to 8 toList + var columns : List[Int] = (0 to 8).toList def run = { out = new PrintWriter(new PrintStream(outputVCF)) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala b/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala index 4d3bf719c..96a5973be 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala @@ -26,7 +26,7 @@ package org.broadinstitute.sting.queue.util import org.apache.commons.mail.{MultiPartEmail, EmailAttachment} -import java.io.{FileReader, File} +import java.io.{IOException, FileReader, File} import javax.mail.internet.InternetAddress import scala.collection.JavaConversions._ @@ -105,7 +105,7 @@ class EmailMessage extends Logging { try { Retry.attempt(() => send(settings), .5) } catch { - case e => logger.error("Error sending message: %n%s".format(this.toString), e) + case e: RetryException=> logger.error("Error sending message: %n%s".format(this.toString), e) } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala b/public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala index 4acd27497..cb7b95b76 100644 --- 
a/public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala @@ -25,6 +25,8 @@ package org.broadinstitute.sting.queue.util +import scala.language.implicitConversions + /** * An importable object that provides automatic primitive to option conversion. */ diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala b/public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala index b112ed9a3..5b9e42a1e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala @@ -49,7 +49,7 @@ object Retry extends Logging { result = f() success = true } catch { - case e => { + case e: Exception=> { count += 1 if (count < tries) { val minutes = wait(count-1) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala b/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala index 8e8a87abe..ff99cb346 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala @@ -26,6 +26,7 @@ package org.broadinstitute.sting.queue.util import java.io.{Serializable, File} +import scala.language.implicitConversions /** * Converts String to/from File diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala index e9a288117..251b1c511 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala @@ -182,7 +182,7 @@ object PipelineTest extends BaseTest with Logging { println("Executing test %s with Queue arguments: %s".format(name, Utils.join(" ",command))) 
CommandLineProgram.start(instance, command) } catch { - case e => + case e: Exception => gotAnException = true if (expectedException != null) { // we expect an exception @@ -224,7 +224,7 @@ object PipelineTest extends BaseTest with Logging { try { commandLine.shutdown() } catch { - case _ => /* ignore */ + case _: Throwable => /* ignore */ }) } }) From d6992d12632c0cfc6b3bdf5b88872fb2d4625e41 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 23 Sep 2013 15:48:47 -0400 Subject: [PATCH 17/77] Updated docs to tell users not to use PCR indel error modeling for PCR free data. --- .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java | 1 + 1 file changed, 1 insertion(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 8776a5e4b..360d0979f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -475,6 +475,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In /** * Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base * insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator). + * VERY IMPORTANT: when using PCR-free sequencing data we definitely recommend setting this argument to NONE. */ @Advanced @Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false) From 2783c84c6b68b289d31ae16549492db92a8927a4 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 24 Sep 2013 22:32:19 -0400 Subject: [PATCH 18/77] Updated docs for DepthPerSampleHC to deliver PT #54237024. 
--- .../gatk/walkers/annotator/DepthPerSampleHC.java | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java index 9bd641011..21325e6f1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java @@ -51,7 +51,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -68,11 +67,15 @@ import java.util.*; /** - * The depth of coverage of each allele per sample + * The depth of coverage for informative reads for each sample. * - * the depth for the HC is the sum of the informative alleles at this site. It's not perfect (as we cannot - * differentiate between reads that align over the event but aren't informative vs. those that aren't even - * close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP). + * An informative read is defined as one from which the allele it carries can be easily distinguished. An example of a + * case where a read might be uninformative is where it only partially overlaps a short tandem repeat and it is not clear + * whether the read contains the reference allele or e.g. an extra repeat. 
+ * The depth here is the sum of the informative reads at this site as determined by the Haplotype Caller; as such it can + * only be calculated and generated through the Haplotype Caller (it will not work when run through the Variant Annotator). + * This calculation is not perfect but it is a pretty good proxy for depth and it does match the values in the AD field + * (i.e., sum(AD) = DP). */ public class DepthPerSampleHC extends GenotypeAnnotation { public void annotate(final RefMetaDataTracker tracker, @@ -121,6 +124,6 @@ public class DepthPerSampleHC extends GenotypeAnnotation { } public List getDescriptions() { - return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(getKeyNames().get(0))); + return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY)); } } \ No newline at end of file From 5113e21437b9a69f99bdea49fa80854d8ab090e7 Mon Sep 17 00:00:00 2001 From: Michael McCowan Date: Mon, 23 Sep 2013 14:26:24 -0400 Subject: [PATCH 20/77] Bug fix: annotation values are parsed as Doubles when they should be parsed as Integers due to implicit conversion. * Updated expected test data in which an integer annotation (MQ0) was formatted as a double.
--- .../variantutils/CombineVariantsIntegrationTest.java | 2 +- .../sting/utils/variant/GATKVariantContextUtils.java | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 66bc74caa..2eeb9221e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -194,7 +194,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { " --excludeNonVariants -combineAnnotations -setKey null" + " -L 20:10,000,000-10,001,000", b37KGReference), 1, - Arrays.asList("2e15db35359144683f1e58e147362679")); + Arrays.asList("0413f0725fc5ec3a4f1ee246f6cb3a2a")); cvExecuteTest("combineSingleSamplePipelineGVCF", spec, true); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index e8c438a53..11cd27a9f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -891,7 +891,11 @@ public class GATKVariantContextUtils { } try { final String stringValue = value.toString(); - values.add(stringValue.contains(".") ? Double.parseDouble(stringValue) : Integer.parseInt(stringValue)); + // Branch to avoid unintentional, implicit type conversions that occur with the ? operator. 
+ if (stringValue.contains(".")) + values.add(Double.parseDouble(stringValue)); + else + values.add(Integer.parseInt(stringValue)); } catch (NumberFormatException e) { badAnnotation = true; } From 27808d336af0745500f050b4f8150eb2c9e4b3d2 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Thu, 26 Sep 2013 13:13:53 -0400 Subject: [PATCH 21/77] Minor clarifications regarding ignoreFilter argument --- .../walkers/variantrecalibration/ApplyRecalibration.java | 5 ++++- .../walkers/variantrecalibration/VariantRecalibrator.java | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 3ae68edab..314efe2a2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -136,7 +136,10 @@ public class ApplyRecalibration extends RodWalker implements T ///////////////////////////// @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) protected double TS_FILTER_LEVEL = 99.0; - @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false) + /** + * For this to work properly, the -ignoreFilter argument should also be applied to the VariantRecalibration command. 
+ */ + @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified, the recalibration will be applied to variants marked as filtered by the specified filter name in the input VCF file", required=false) private String[] IGNORE_INPUT_FILTERS = null; @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't output filtered loci after applying the recalibration", required=false) protected boolean EXCLUDE_FILTERED = false; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 5a8debc72..1ee02d10d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -201,7 +201,10 @@ public class VariantRecalibrator extends RodWalker Date: Thu, 26 Sep 2013 14:28:22 -0400 Subject: [PATCH 22/77] Minor clarifications & formatting tweaks for dcov docs --- .../arguments/GATKArgumentCollection.java | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 174e434fe..e1620c938 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -119,21 +119,29 @@ public class GATKArgumentCollection { // Downsampling Arguments // // -------------------------------------------------------------------------------------------------------------- - @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. 
Reads will be selected randomly to be removed from the pile based on the method described here", required = false) + /** + * Reads will be selected randomly to be removed from the pile based on the method described here. + */ + @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus", required = false) public DownsampleType downsamplingType = null; @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) public Double downsampleFraction = null; + /** + * For locus-based traversals (eg., LocusWalkers and ActiveRegionWalkers), downsample_to_coverage controls the + * maximum depth of coverage at each locus. For non-locus-based traversals (eg., ReadWalkers), this controls the + * maximum number of reads sharing the same alignment start position. Note that for ReadWalkers, since -dcov controls + * the maximum number of reads sharing the same alignment start position, you will typically need to use much lower + * dcov values than you would with LocusWalkers to see an effect. Note also that this downsampling option does NOT + * produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of the + * to-coverage downsampler is to maintain an even representation of reads from all alignment start positions when + * removing excess coverage. For a true across-the-board unbiased random sampling of reads, use -dfrac instead. Also + * note that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling + * algorithm will under some circumstances retain slightly more coverage than requested. + */ @Argument(fullName = "downsample_to_coverage", shortName = "dcov", - doc = "Coverage [integer] to downsample to. For locus-based traversals (eg., LocusWalkers and ActiveRegionWalkers)," + - "this controls the maximum depth of coverage at each locus. 
For non-locus-based traversals (eg., ReadWalkers), " + - "this controls the maximum number of reads sharing the same alignment start position. Note that this downsampling " + - "option does NOT produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of " + - "the to-coverage downsampler is to maintain an even representation of reads from all alignment start positions " + - "when removing excess coverage. For a true across-the-board unbiased random sampling of reads, use -dfrac instead. " + - "Also note that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling " + - "algorithm will under some circumstances retain slightly more coverage than requested.", + doc = "Coverage [integer] to downsample to", required = false) public Integer downsampleCoverage = null; From 511948890a4b72f7a9a6c33579f2c66555c46ae6 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Thu, 26 Sep 2013 14:50:32 -0400 Subject: [PATCH 23/77] Modify gatkdoc template to handle downsampling info better --- settings/helpTemplates/generic.template.html | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/settings/helpTemplates/generic.template.html b/settings/helpTemplates/generic.template.html index 098d294cb..b78bba48c 100644 --- a/settings/helpTemplates/generic.template.html +++ b/settings/helpTemplates/generic.template.html @@ -174,11 +174,16 @@ <#if downsampling?size != 0>

Downsampling settings

-

This tool overrides the engine's default downsampling settings.

-
    -
  • Mode: ${downsampling.by}
  • -
  • To coverage: ${downsampling.to_cov}
  • -
+ <#if downsampling.by == "NONE"> +

This tool does not apply any downsampling by default.

+ + <#if downsampling.by != "NONE"> +

This tool applies the following downsampling settings by default.

+
    +
  • Mode: ${downsampling.by}
  • +
  • To coverage: ${downsampling.to_cov}
  • +
+ <#if refwindow?size != 0>

Window size

From ef1d58b7ff42d8fb340bb16487f127d0f8b1c26f Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 27 Sep 2013 11:24:35 -0400 Subject: [PATCH 24/77] Bugfix for hom ref records that aren't GVCF blocks. --- .../genotyper/UnifiedGenotyperEngine.java | 2 +- .../sting/utils/gvcf/GVCFWriter.java | 5 +++- .../sting/utils/gvcf/GVCFWriterUnitTest.java | 29 +++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index ec31e1f2f..5c6e9dc01 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -374,7 +374,7 @@ public class UnifiedGenotyperEngine { final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap) { - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false,perReadAlleleLikelihoodMap); + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap); } /** diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java index 8ee3c166c..98aedf786 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java @@ -46,6 +46,9 @@ package org.broadinstitute.sting.utils.gvcf; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.Genotype; import 
org.broadinstitute.variant.variantcontext.GenotypeBuilder; import org.broadinstitute.variant.variantcontext.VariantContext; @@ -284,7 +287,7 @@ public class GVCFWriter implements VariantContextWriter { } final Genotype g = vc.getGenotype(0); - if ( g.isHomRef() ) { + if ( g.isHomRef() && vc.hasAlternateAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) ) { // create bands final VariantContext maybeCompletedBand = addHomRefSite(vc, g); if ( maybeCompletedBand != null ) underlyingWriter.add(maybeCompletedBand); diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java index e353739e5..5c14c490e 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java @@ -129,6 +129,16 @@ public class GVCFWriterUnitTest extends BaseTest { return vcb.genotypes(gb.make()).make(); } + private VariantContext makeHomRefAlt(final String contig, final int start, final int GQ) { + final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, Arrays.asList(REF, ALT)); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF)); + gb.GQ(GQ); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + private VariantContext makeNonRef(final String contig, final int start, final int GQ) { final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, Arrays.asList(REF, ALT)); final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, ALT)); @@ -305,6 +315,25 @@ public class GVCFWriterUnitTest extends BaseTest { assertGoodVC(mockWriter.emitted.get(2), "20", 6, 7, false); } + @Test + public void testHomRefAlt() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + 
writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + writer.add(makeHomRefAlt("20", 3, 0)); + writer.add(makeHomRef("20", 4, 0)); + writer.add(makeHomRef("20", 5, 0)); + writer.add(makeHomRef("20", 6, 0)); + writer.add(makeHomRef("20", 7, 0)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 3); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("END")); + Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("BLOCK_SIZE")); + assertGoodVC(mockWriter.emitted.get(2), "20", 4, 7, false); + } + @DataProvider(name = "BandPartitionData") public Object[][] makeBandPartitionData() { List tests = new ArrayList<>(); From 9f7fa247f67e3cd33f26c3e1cf2848c35bd479e3 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Thu, 26 Sep 2013 16:04:08 -0400 Subject: [PATCH 25/77] Disable VQSR tranche plots in INDEL mode --- .../VariantRecalibrator.java | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 1ee02d10d..c3f575022 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -181,7 +181,7 @@ public class VariantRecalibrator extends RodWalker randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff, final String[] annotationKeys ) { From 839b918f58fdc31a03d59d40f161ddbe30c3d4e7 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 9 Sep 2013 16:10:39 -0400 Subject: [PATCH 26/77] Length metric updates to QualifyMissingIntervals * add a length of the overlaping interval metric as per CSER 
request * standardized the distance metrics to be positive when fully overlapping and the longest off-target tail (as a negative number) when not overlapping * add gatkdocs to the tool (finally!) --- .../missing/QualifyMissingIntervals.java | 64 +++++++++++++++---- 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java index 8cc7bb8f3..9fabd6a37 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java @@ -76,10 +76,12 @@ import java.util.List; *
    *
  • Average Base Quality
  • *
  • Average Mapping Quality
  • + *
  • Average Depth
  • *
  • GC Content
  • - *
  • Position in the target
  • - *
  • Coding Sequence / Intron
  • - *
  • Length of the uncovered area
  • + *
  • Position in the target (Integer.MIN_VALUE if no overlap)
  • + *
  • Length of the overlapping target (zero if no overlap)
  • + *
  • Coding Sequence / Intron (optional)
  • + *
  • Length of the uncovered interval
  • *
* *

Input

@@ -89,7 +91,7 @@ import java.util.List; * *

Output

*

- * GC content calculations per interval. + * GC content, distance from the end of the target, coding sequence intersection, mapping and base quality averages and average depth per "missing" interval. *

* *

Example

@@ -108,12 +110,24 @@ import java.util.List; @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @By(DataSource.REFERENCE) public final class QualifyMissingIntervals extends LocusWalker implements NanoSchedulable { + /** + * A single GATKReport table with the qualifications on why the intervals passed by the -L argument were missing. + */ @Output protected PrintStream out; + /** + * List of targets used in the experiment. This file will be used to calculate the distance your missing + * intervals are to the targets (usually exons). Typically this is your hybrid selection targets file + * (e.g. Agilent exome target list) + */ @Argument(shortName = "targets", required = true) public String targetsFile; + /** + * List of coding sequence intervals (exons) if different from the targets file, to distinguish intervals + * that overlap the cds and intervals that don't. + */ @Argument(shortName = "cds", required = false) public String cdsFile = null; @@ -224,15 +238,18 @@ public final class QualifyMissingIntervals extends LocusWalker public void onTraversalDone(List> results) { for (Pair r : results) { - GenomeLoc interval = r.getFirst(); - Metrics metrics = r.getSecond(); + final GenomeLoc interval = r.getFirst(); + final Metrics metrics = r.getSecond(); + final List overlappingIntervals = target.getOverlapping(interval); + simpleReport.addRow( interval.toString(), metrics.gccontent(), metrics.baseQual(), metrics.mapQual(), metrics.depth(), - getPositionInTarget(interval), + getPositionInTarget(interval, overlappingIntervals), + getTargetSize(overlappingIntervals), cds.overlaps(interval), interval.size(), interpret(metrics, interval) @@ -242,13 +259,34 @@ public final class QualifyMissingIntervals extends LocusWalker out.close(); } - private int getPositionInTarget(GenomeLoc interval) { - final List hits = target.getOverlapping(interval); - int result = 0; - for (GenomeLoc hit : hits) { - result = interval.getStart() - 
hit.getStart(); // if there are multiple hits, we'll get the last one. + private int getPositionInTarget(final GenomeLoc interval, final List hits) { + if (hits.size() > 0) { + final GenomeLoc hit = hits.get(0); + + // interval is larger on both ends than the target -- return the maximum distance to either side as a negative number. (min of 2 negative numbers) + if (interval.getStart() < hit.getStart() && interval.getStop() > hit.getStop()) + return Math.min(interval.getStart() - hit.getStart(), + interval.getStop() - hit.getStop()); + + // interval is a left overlap -- return a negative number representing the distance between the two starts + else if (interval.getStart() < hit.getStart()) + return hit.getStart() - interval.getStart(); + + // interval is a right overlap -- return a negative number representing the distance between the two stops + else if (interval.getStop() > hit.getStop()) + return hit.getStop() - interval.getStop(); + + // interval is fully contained -- return the smallest distance to the edge of the target (left or right) as a positive number + else + return Math.min(Math.abs(hit.getStart() - interval.getStart()), + Math.abs(hit.getStop() - interval.getStop())); } - return result; + // if there is no overlapping interval, return int min value. + return Integer.MIN_VALUE; + } + + private int getTargetSize(final List overlappingIntervals) { + return overlappingIntervals.size() > 0 ? 
overlappingIntervals.get(0).size() : -1; } String interpret(final Metrics metrics, final GenomeLoc interval) { From 63ace685c90d4973c6cb144095db8502e8f602e9 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 10 Sep 2013 10:59:27 -0400 Subject: [PATCH 27/77] add unit tests --- .../missing/QualifyMissingIntervals.java | 23 +++++------ .../QualifyMissingIntervalsUnitTest.java | 38 ++++++++++++++++++- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java index 9fabd6a37..014ed6dcb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java @@ -259,27 +259,24 @@ public final class QualifyMissingIntervals extends LocusWalker out.close(); } - private int getPositionInTarget(final GenomeLoc interval, final List hits) { - if (hits.size() > 0) { - final GenomeLoc hit = hits.get(0); + protected static int getPositionInTarget(final GenomeLoc interval, final List targets) { + if (targets.size() > 0) { + final GenomeLoc target = targets.get(0); // interval is larger on both ends than the target -- return the maximum distance to either side as a negative number. 
(min of 2 negative numbers) - if (interval.getStart() < hit.getStart() && interval.getStop() > hit.getStop()) - return Math.min(interval.getStart() - hit.getStart(), - interval.getStop() - hit.getStop()); + if (interval.getStart() < target.getStart() && interval.getStop() > target.getStop()) + return Math.min(target.getStart() - interval.getStart(), target.getStop() - interval.getStop()); // interval is a left overlap -- return a negative number representing the distance between the two starts - else if (interval.getStart() < hit.getStart()) - return hit.getStart() - interval.getStart(); + else if (interval.getStart() < target.getStart()) + return interval.getStart() - target.getStart(); // interval is a right overlap -- return a negative number representing the distance between the two stops - else if (interval.getStop() > hit.getStop()) - return hit.getStop() - interval.getStop(); + else if (interval.getStop() > target.getStop()) + return target.getStop() - interval.getStop(); // interval is fully contained -- return the smallest distance to the edge of the target (left or right) as a positive number - else - return Math.min(Math.abs(hit.getStart() - interval.getStart()), - Math.abs(hit.getStop() - interval.getStop())); + return Math.min(interval.getStart() - target.getStart(), target.getStop() - interval.getStop()); } // if there is no overlapping interval, return int min value. 
return Integer.MIN_VALUE; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java index 7d6d05736..7ab891bd0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.missing; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import java.util.List; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; @@ -57,7 +59,6 @@ import org.testng.annotations.Test; * User: carneiro * Date: 9/20/13 * Time: 3:59 PM - * To change this template use File | Settings | File Templates. 
*/ public class QualifyMissingIntervalsUnitTest extends BaseTest { @Test(enabled = true) @@ -92,4 +93,39 @@ public class QualifyMissingIntervalsUnitTest extends BaseTest { for (Metrics m : array) Assert.assertEquals(tool.interpret(m, smallInterval), QualifyMissingIntervals.Interpretation.SMALL_INTERVAL.toString()); } + + @Test(enabled = true) + void testGetPositionInTarget () { + final UnvalidatingGenomeLoc target = new UnvalidatingGenomeLoc("a", 0, 30, 50); + final List targets = new ObjectArrayList<>(1); + targets.add(target); + + // left overlap + UnvalidatingGenomeLoc interval = new UnvalidatingGenomeLoc("a", 0, 10, 50); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), -20); + + // right overlap + interval = new UnvalidatingGenomeLoc("a", 0, 40, 60); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), -10); + + // interval > target with short right tail + interval = new UnvalidatingGenomeLoc("a", 0, 10, 60); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), -10); + + // interval > target with short left tail + interval = new UnvalidatingGenomeLoc("a", 0, 10, 80); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), -30); + + // interval < target with short right tail + interval = new UnvalidatingGenomeLoc("a", 0, 32, 40); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), 2); + + // interval < target with short left tail + interval = new UnvalidatingGenomeLoc("a", 0, 40, 42); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), 8); + + // no overlap + interval = new UnvalidatingGenomeLoc("a", 0, 40, 42); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, new ObjectArrayList()), Integer.MIN_VALUE); + } } From 5d6421494b201d903881590a17211a028e42c704 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 9 Oct 2013 14:38:15 -0400 
Subject: [PATCH 32/77] Fix mismatching number of columns in report Quick fix the missing column header in the QualifyMissingIntervals report. Adding a QScript for the tool as well as a few minor updates to the GATKReportGatherer. --- .../diagnostics/missing/QualifyMissingIntervals.java | 11 ++++++----- .../sting/gatk/report/GATKReportGatherer.java | 8 +++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java index 014ed6dcb..52a92d9ff 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java @@ -47,16 +47,15 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.missing; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Gather; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.report.GATKReport; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; +import org.broadinstitute.sting.gatk.report.GATKReportGatherer; +import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -109,10 +108,12 @@ import java.util.List; 
*/ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @By(DataSource.REFERENCE) +@PartitionBy(PartitionType.INTERVAL) public final class QualifyMissingIntervals extends LocusWalker implements NanoSchedulable { /** * A single GATKReport table with the qualifications on why the intervals passed by the -L argument were missing. */ + @Gather(GATKReportGatherer.class) @Output protected PrintStream out; @@ -194,7 +195,7 @@ public final class QualifyMissingIntervals extends LocusWalker if (cdsFile == null) cdsFile = targetsFile; - simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "IN", "GC", "BQ", "MQ", "DP", "TP", "CD", "LN", "DS"); + simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "IN", "GC", "BQ", "MQ", "DP", "TP", "TS", "CD", "LN", "DS"); final GenomeLocParser parser = getToolkit().getGenomeLocParser(); target = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, targetsFile)); cds = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, cdsFile)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java index e9ccebf34..5e7c3ec86 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java @@ -42,23 +42,21 @@ public class GATKReportGatherer extends Gatherer { try { o = new PrintStream(output); } catch (FileNotFoundException e) { - throw new UserException("File to be output by CoverageByRG Gather function was not found"); + throw new UserException(String.format("File %s to be output by GATKReportGatherer function was not found", output)); } GATKReport current = new GATKReport(); boolean isFirst = true; for (File input : inputs) { - - // If the table is empty if (isFirst) { current = new GATKReport(input); isFirst = false; } 
else { - GATKReport toAdd = new GATKReport(input); - current.concat(toAdd); + current.concat(new GATKReport(input)); } } current.print(o); + o.close(); } } From efbfdb64fed48c27bd052812ecec483124946945 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 19 Aug 2013 23:52:23 -0400 Subject: [PATCH 33/77] Qscript to Downsample and analyze an exome BAM this script downsamples an exome BAM several times and makes a coverage distribution analysis (of bases that pass filters) as well as haplotype caller calls with a NA12878 Knowledge Base assessment with comparison against multi-sample calling with the UG. This script was used for the "downsampling the exome" presentation --- .../sting/queue/util/QScriptUtils.scala | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala index 7b0e5d1be..b078bcd4f 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala @@ -79,8 +79,23 @@ object QScriptUtils { if (sample.isEmpty) sample = r.getSample else if (sample != r.getSample) - return true; + return true } false } + + /** + * Returns all distinct samples in the BAM file + * + * @param bam the bam file + * @return a set with all distinct samples (in no particular order) + */ + def getSamplesFromBAM(bam: File) : Set[String] = { + val reader = new SAMFileReader(bam) + var samples: Set[String] = Set() + for (rg <- reader.getFileHeader.getReadGroups) { + samples += rg.getSample + } + samples + } } From 5a2ef37ead2360bb87f4d74feb225cdffae54e1f Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 16 Oct 2013 15:24:33 -0400 Subject: [PATCH 36/77] Tweak dcov documentation to help prevent user confusion Geraldine-approved! 
--- .../arguments/GATKArgumentCollection.java | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index e1620c938..17849749e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -129,19 +129,18 @@ public class GATKArgumentCollection { public Double downsampleFraction = null; /** - * For locus-based traversals (eg., LocusWalkers and ActiveRegionWalkers), downsample_to_coverage controls the - * maximum depth of coverage at each locus. For non-locus-based traversals (eg., ReadWalkers), this controls the - * maximum number of reads sharing the same alignment start position. Note that for ReadWalkers, since -dcov controls - * the maximum number of reads sharing the same alignment start position, you will typically need to use much lower - * dcov values than you would with LocusWalkers to see an effect. Note also that this downsampling option does NOT - * produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of the + * For locus-based traversals (LocusWalkers and ActiveRegionWalkers), downsample_to_coverage controls the + * maximum depth of coverage at each locus. For read-based traversals (ReadWalkers), it controls the + * maximum number of reads sharing the same alignment start position. For ReadWalkers you will typically need to use + * much lower dcov values than you would with LocusWalkers to see an effect. 
Note that this downsampling option does + * not produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of the * to-coverage downsampler is to maintain an even representation of reads from all alignment start positions when - * removing excess coverage. For a true across-the-board unbiased random sampling of reads, use -dfrac instead. Also - * note that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling - * algorithm will under some circumstances retain slightly more coverage than requested. + * removing excess coverage. For a truly unbiased random sampling of reads, use -dfrac instead. Also note + * that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling + * algorithm will under some circumstances retain slightly more or less coverage than requested. */ @Argument(fullName = "downsample_to_coverage", shortName = "dcov", - doc = "Coverage [integer] to downsample to", + doc = "Coverage [integer] to downsample to per locus (for locus walkers) or per alignment start position (for read walkers)", required = false) public Integer downsampleCoverage = null; From 9498950b1c02cd77a2ab4a539f76a8f2ce2724c3 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Thu, 3 Oct 2013 16:52:48 -0400 Subject: [PATCH 37/77] Adding more specific error message when one of the scripts doesn't exist. 
--Previously it gave a cryptic message: ----IO error while decoding blarg.script with UTF-8 ----Please try specifying another one using the -encoding option --- .../src/org/broadinstitute/sting/queue/QScriptManager.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala b/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala index c6b8eff13..80dd53302 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala @@ -36,6 +36,7 @@ import org.apache.log4j.Level import org.broadinstitute.sting.queue.util.TextFormatUtils._ import org.broadinstitute.sting.utils.classloader.JVMUtils import scala.reflect.internal.util.{FakePos, NoPosition, Position, StringOps} +import org.broadinstitute.sting.utils.exceptions.UserException /** * Plugin manager for QScripts which loads QScripts into the current class loader. @@ -46,6 +47,11 @@ class QScriptManager() extends Logging { * Heavily based on scala/src/compiler/scala/tools/ant/Scalac.scala */ def loadScripts(scripts: Seq[File], tempDir: File) { + // Make sure the scripts actually exist. + scripts.foreach{ + file => if( !file.exists()) throw new UserException.CouldNotReadInputFile(file, "it does not exist.") + } + if (scripts.size > 0) { val settings = new Settings((error: String) => logger.error(error)) settings.deprecation.value = true From 5ed47988b83de0c17cc9b1065522a4aa14f8818b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 24 Oct 2013 17:13:25 -0400 Subject: [PATCH 38/77] Changed the parameter names from cds to baits Making the usage more clear since the parameter is being used over and over to define baited regions. Updated the headers accordingly and made it more readable. 
--- .../missing/QualifyMissingIntervals.java | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java index 52a92d9ff..54fc6e97e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java @@ -126,11 +126,10 @@ public final class QualifyMissingIntervals extends LocusWalker public String targetsFile; /** - * List of coding sequence intervals (exons) if different from the targets file, to distinguish intervals - * that overlap the cds and intervals that don't. + * List of baits to distinguish untargeted intervals from those that are targeted but not covered */ - @Argument(shortName = "cds", required = false) - public String cdsFile = null; + @Argument(shortName = "baits", required = false) + public String baitsFile = null; /** * This value will be used to determine whether or not an interval had too high or too low GC content to be @@ -183,8 +182,8 @@ public final class QualifyMissingIntervals extends LocusWalker } GATKReport simpleReport; - GenomeLocSortedSet target; - GenomeLocSortedSet cds; + GenomeLocSortedSet targets; + GenomeLocSortedSet baits; public boolean isReduceByInterval() { return true; @@ -192,13 +191,13 @@ public final class QualifyMissingIntervals extends LocusWalker public void initialize() { // if cds file is not provided, just use the targets file (no harm done) - if (cdsFile == null) - cdsFile = targetsFile; + if (baitsFile == null) + baitsFile = targetsFile; - simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "IN", "GC", "BQ", "MQ", "DP", "TP", "TS", "CD", "LN", "DS"); + simpleReport = 
GATKReport.newSimpleReport("QualifyMissingIntervals", "INTERVAL", "GC", "BQ", "MQ", "DP", "POS_IN_TARGET", "TARGET_SIZE", "BAITED", "MISSING_SIZE", "INTERPRETATION"); final GenomeLocParser parser = getToolkit().getGenomeLocParser(); - target = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, targetsFile)); - cds = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, cdsFile)); + targets = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, targetsFile)); + baits = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, baitsFile)); } public Metrics reduceInit() { @@ -241,7 +240,7 @@ public final class QualifyMissingIntervals extends LocusWalker for (Pair r : results) { final GenomeLoc interval = r.getFirst(); final Metrics metrics = r.getSecond(); - final List overlappingIntervals = target.getOverlapping(interval); + final List overlappingIntervals = targets.getOverlapping(interval); simpleReport.addRow( interval.toString(), @@ -251,7 +250,7 @@ public final class QualifyMissingIntervals extends LocusWalker metrics.depth(), getPositionInTarget(interval, overlappingIntervals), getTargetSize(overlappingIntervals), - cds.overlaps(interval), + baits.overlaps(interval), interval.size(), interpret(metrics, interval) ); From 209f2a61aa7e8f1e2903c0e25261edcee948507a Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 29 Oct 2013 10:33:51 -0400 Subject: [PATCH 39/77] Updated the GATK bundle script to: 1. Include exome target list for b37 2. Not delete the 'current' link unless -run is applied to the command line! 
(sorry, Ryan) --- .../queue/qscripts/GATKResourcesBundle.scala | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 1736adc17..d3e94d306 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -95,6 +95,7 @@ class GATKResourcesBundle extends QScript { def isBAM(file: File) = file.getName.endsWith(".bam") def isOUT(file: File) = file.getName.endsWith(".out") def isFASTA(file: File) = file.getName.endsWith(".fasta") + def isIntervalList(file: File) = file.getName.endsWith(".interval_list") var RESOURCES: List[Resource] = Nil def addResource(comp: Resource) { RESOURCES = comp :: RESOURCES } @@ -180,11 +181,17 @@ class GATKResourcesBundle extends QScript { "NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true)) // - // Test BAM file, specific to each reference + // Test BAM file, only for the b37 reference // addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.NA12878.bam", "IGNORE", b37, false, false)) + // + // Exome targets file, only for the b37 reference + // + addResource(new Resource("/seq/references/HybSelOligos/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", + "Broad.human.exome", b37, true, false, false)) + // // refGene files specific to each reference // @@ -217,7 +224,7 @@ class GATKResourcesBundle extends QScript { val currentLink = new File(BUNDLE_ROOT + "/current") - if ( currentLink.exists ) currentLink.delete() + if ( currentLink.exists ) add(new deleteLink(currentLink)) add(new linkFile(bundleDir, currentLink)) } @@ -275,6 +282,9 @@ class 
GATKResourcesBundle extends QScript { } } } + } else if ( isIntervalList(resource.file) ) { + val out = destFile(BUNDLE_DIR, resource.ref, resource.destname(resource.ref)) + add(new cpFile(resource.file, out)) } else { //throw new ReviewedStingException("Unknown file type: " + resource) } @@ -354,6 +364,10 @@ class GATKResourcesBundle extends QScript { def commandLine = "cp %s %s".format(in.getAbsolutePath, out.getAbsolutePath) } + class deleteLink(@Input val in: File) extends CommandLineFunction { + def commandLine = "rm %s".format(in.getAbsolutePath) + } + class linkFile(@Input val in: File, @Output val out: File) extends CommandLineFunction { def commandLine = "ln -s %s %s".format(in.getAbsolutePath, out.getAbsolutePath) } From b22c9c2cb46b3218f1121e0adcb3b3525ec0191b Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 30 Sep 2013 10:17:43 -0400 Subject: [PATCH 40/77] Improvements to the reference model pipeline. -- We use the RegenotypeVariants walker to recompute the qual field. (instead of the discussed idea of adding this functionality to CombineVariants) -- QualByDepth will now be recomputed even if the stratified contexts are missing. This greatly improves the QD estimate for this pipeline. Doesn't work for multi-allelics since the qual can't be recomputed. 
--- .../gatk/walkers/annotator/QualByDepth.java | 15 +++--- .../gatk/walkers/annotator/RankSumTest.java | 2 +- .../VariantAnnotatorIntegrationTest.java | 48 +++++++++++++++++++ .../walkers/variantutils/CombineVariants.java | 2 - 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index a3fbcc439..906cfa021 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -54,6 +54,8 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.coverage.DepthOfCoverage; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; @@ -94,19 +96,20 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( !genotype.isHet() && !genotype.isHomVar() ) continue; - if (stratifiedContexts!= null) { - AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); + if (stratifiedContexts!= null && !stratifiedContexts.isEmpty()) { + final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) continue; depth += context.getBasePileup().depthOfCoverage(); - } - else if (perReadAlleleLikelihoodMap != null) { - PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = 
perReadAlleleLikelihoodMap.get(genotype.getSampleName()); + } else if (perReadAlleleLikelihoodMap != null) { + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName()); if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty()) continue; depth += perReadAlleleLikelihoods.getNumberOfStoredElements(); + } else if (genotype.hasDP() && vc.isBiallelic()) { // TODO -- this currently only works with biallelic variants for now because multiallelics have had their PLs stripped out and therefore their qual score can't be recomputed + depth += genotype.getDP(); } } @@ -116,7 +119,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength); QD = fixTooHighQD(QD); - Map map = new HashMap(); + Map map = new HashMap<>(); map.put(getKeyNames().get(0), String.format("%.2f", QD)); return map; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 1ba13afa1..ab5a40145 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -83,7 +83,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { - // either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null + // either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null final GenotypesContext genotypes = vc.getGenotypes(); if (genotypes == null || genotypes.size() == 0) diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 9f8b72c1d..58c3bb9bd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -345,4 +345,52 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { Assert.assertFalse(lineIterator.hasNext()); Assert.assertFalse(lineIteratorAnn.hasNext()); } + + @Test + public void testQualByDepth() throws IOException { + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + final File outputVCF = executeTest("testQualByDepth", spec).getFirst().get(0); + + final String baseNoQD = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA QualByDepth"; + final WalkerTestSpec specNoQD = new WalkerTestSpec(baseNoQD, 1, Arrays.asList("")); + specNoQD.disableShadowBCF(); + final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0); + + final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth"; + final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("139a4384f5a7c1f49ada67f416642249")); + specAnn.disableShadowBCF(); + final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0); + + // confirm that the QD values are present in the new file for all biallelic variants + // QD values 
won't be identical because some filtered reads are missing during re-annotation + + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + + final VCFCodec codecAnn = new VCFCodec(); + final FileInputStream sAnn = new FileInputStream(outputVCFAnn); + final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn)); + codecAnn.readHeader(lineIteratorAnn); + + while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + + final String lineAnn = lineIteratorAnn.next(); + Assert.assertFalse(lineAnn == null); + final VariantContext vcAnn = codecAnn.decode(lineAnn); + + if( vc.isBiallelic() ) { + Assert.assertTrue(vc.hasAttribute("QD")); + Assert.assertTrue(vcAnn.hasAttribute("QD")); + } + } + + Assert.assertFalse(lineIterator.hasNext()); + Assert.assertFalse(lineIteratorAnn.hasNext()); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 396d5686b..1362b109e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -235,8 +235,6 @@ public class CombineVariants extends RodWalker implements Tree vcfWriter.writeHeader(vcfHeader); } - - private void validateAnnotateUnionArguments() { Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); From 96024403bf9dbcb32de249d448363f4163a49e66 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 4 Nov 2013 10:01:22 -0500 Subject: [PATCH 41/77] Update the dbsnp version in the bundle from 137 to 
138; resolves PT #59771004. --- .../sting/queue/qscripts/GATKResourcesBundle.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index d3e94d306..307ce171f 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -149,8 +149,8 @@ class GATKResourcesBundle extends QScript { // // standard VCF files. Will be lifted to each reference // - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_137_b37.leftAligned.vcf", - "dbsnp_137", b37, true, false)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_138_b37.leftAligned.vcf", + "dbsnp_138", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_2141_samples.b37.vcf", "1000G_omni2.5", b37, true, false)) From 2fc40a0aedd95086043c69bf1ea7b5eee38b03c4 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 7 Nov 2013 09:02:17 -0500 Subject: [PATCH 43/77] Fixing the liftover script to not require strict VCF header validation. Apparently no one has used the liftover script for a while (which I guess is a good thing)... 
--- .../sting/gatk/walkers/variantutils/LiftoverVariants.java | 3 ++- public/perl/liftOverVCF.pl | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 379b1c2a3..478a2a351 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.writer.Options; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -118,7 +119,7 @@ public class LiftoverVariants extends RodWalker { final VCFHeader vcfHeader = new VCFHeader(metaData, samples); - writer = VariantContextWriterFactory.create(file, getMasterSequenceDictionary(), VariantContextWriterFactory.NO_OPTIONS); + writer = VariantContextWriterFactory.create(file, getMasterSequenceDictionary(), EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER)); writer.writeHeader(vcfHeader); } diff --git a/public/perl/liftOverVCF.pl b/public/perl/liftOverVCF.pl index ba4198292..a942145d7 100755 --- a/public/perl/liftOverVCF.pl +++ b/public/perl/liftOverVCF.pl @@ -36,7 +36,7 @@ my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; # lift over the file print "Lifting over the vcf..."; -my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -V:variant $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; +my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T 
LiftoverVariants -R $oldRef.fasta -V:variant $in -o $unsorted_vcf -chain $chain -dict $newRef.dict -U LENIENT_VCF_PROCESSING"; if ($recordOriginalLocation) { $cmd .= " -recordOriginalLocation"; } @@ -66,7 +66,7 @@ system($cmd) == 0 or quit("The sorting step failed. Please correct the necessar # Filter the VCF for bad records print "\nFixing/removing bad records...\n"; -$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -V:variant $sorted_vcf -o $out"; +$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -V:variant $sorted_vcf -o $out -U LENIENT_VCF_PROCESSING"; system($cmd) == 0 or quit("The filtering step failed. Please correct the necessary errors before retrying."); # clean up From 725656ae7e613e1d610029909c3615408d2e8733 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 22 Aug 2013 22:18:47 -0400 Subject: [PATCH 45/77] Generalizing the FullProcessingPipeline Qscript We have generalized the processing script to be able to handle multiple scenarios. Originally it was designed for PCR free data only, we added all the steps necessary to start from fastq and process RNA-seq as well as non-human data. This is our go to script in TechDev. 
* add optional "starting from fastq" path to the pipeline * add mark duplicates (optionally) to the pipeline * add an option to run with the mouse data (without dbsnp and with single ended fastq) * add option to process RNA-seq data from topHat (add RG and reassign mapping quality if necessary) * add option to filter or include reads with N in the cigar string * add parameter to allow keeping the intermediate files --- .../sting/queue/extensions/gatk/GATKExtensionsGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java index ced0809f7..bf675503b 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java @@ -325,7 +325,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { * @throws IOException If the file cannot be written. 
*/ private void writeFile(String fullClassName, String content) throws IOException { - File outputFile = new File(outputDirectory, fullClassName.replace(".", "/") + ".scala"); + File outputFile = new File(outputDirectory, fullClassName.replace(".", "/") + "MoleculoPipeline.scala"); if (outputFile.exists()) { String existingContent = FileUtils.readFileToString(outputFile); if (StringUtils.equals(content, existingContent)) From 296bcc7fb14369f2ec3e40229e47273798e220aa Mon Sep 17 00:00:00 2001 From: Phillip Dexheimer Date: Tue, 12 Nov 2013 11:42:09 -0500 Subject: [PATCH 46/77] Changed name of jobs submitted to cluster job runners -- Added 'jobRunnerJobName' definition to QFunction, defaults to value of shortDescription -- Edited Lsf and Drmaa JobRunners to use this string instead of description for naming jobs in the scheduler Signed-off-by: Joel Thibault --- .../sting/queue/engine/drmaa/DrmaaJobRunner.scala | 2 +- .../sting/queue/engine/lsf/Lsf706JobRunner.scala | 2 +- .../org/broadinstitute/sting/queue/function/QFunction.scala | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala index 79fc8589f..b405c91a2 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala @@ -50,7 +50,7 @@ class DrmaaJobRunner(val session: Session, val function: CommandLineFunction) ex session.synchronized { val drmaaJob: JobTemplate = session.createJobTemplate - drmaaJob.setJobName(function.description.take(jobNameLength).replaceAll(jobNameFilter, "_")) + drmaaJob.setJobName(function.jobRunnerJobName.take(jobNameLength).replaceAll(jobNameFilter, "_")) // Set the current working directory drmaaJob.setWorkingDirectory(function.commandDirectory.getPath) diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index ead29bbf5..e9f141880 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -71,7 +71,7 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR for (i <- 0 until LibLsf.LSF_RLIM_NLIMITS) request.rLimits(i) = LibLsf.DEFAULT_RLIMIT; - request.jobName = function.description.take(LibBat.MAX_JOB_NAME_LEN) + request.jobName = function.jobRunnerJobName.take(LibBat.MAX_JOB_NAME_LEN) request.options |= LibBat.SUB_JOB_NAME // Set the output file for stdout diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala index abbb63271..3afd289af 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala @@ -150,6 +150,11 @@ trait QFunction extends Logging with QJobReport { case _ => analysisName } } + + /** + * The name of the job as submitted to the job runner + */ + def jobRunnerJobName = shortDescription /** * Returns true if the function is done. 
From dac3dbc997cb6778b0eb88e87f7be9c15afdb46b Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Fri, 11 Oct 2013 10:23:38 -0400 Subject: [PATCH 47/77] Improved gatkdocs for InbreedingCoefficient, ReduceReads, ErrorRatePerCycle Clarified caveat for InbreedingCoefficient Cleaned up docstrings for ReduceReads Brushed up doc for ErrorRatePerCycle --- .../walkers/annotator/InbreedingCoeff.java | 4 +- .../compression/reducereads/ReduceReads.java | 80 +++++++++---------- .../diagnostics/ErrorRatePerCycle.java | 23 +++--- 3 files changed, 54 insertions(+), 53 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index da2143ec1..3f815346d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -70,8 +70,8 @@ import java.util.*; * * A continuous generalization of the Hardy-Weinberg test for disequilibrium that works * well with limited coverage per sample. See the 1000 Genomes Phase I release for - * more information. Note that the Inbreeding Coefficient will not be calculated for files - * with fewer than a minimum (generally 10) number of samples. + * more information. Note that the Inbreeding Coefficient can only be calculated for + * cohorts containing at least 10 founder samples. 
*/ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 31fe7e380..383ba5ee9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -84,10 +84,10 @@ import java.util.List; * Reduces the BAM file using read based compression that keeps only essential information for variant calling * *

- * This walker will generated reduced versions of the BAM files that still follow the BAM spec - * and contain all the information necessary for the GSA variant calling pipeline. Some options - * allow you to tune in how much compression you want to achieve. The default values have been - * shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the + * This tool will generate reduced versions of the BAM files that still follow the BAM specification + * and contain all the information necessary to call variants according to the GATK Best Practices recommendations. + * Some options allow you to tune how much compression you want to achieve. The default values have been + * shown to reduce a typical whole exome BAM file by 100x. The higher the coverage, the bigger the * savings in file size and performance of the downstream tools. * *

Input

@@ -121,25 +121,25 @@ public class ReduceReads extends ReadWalker, Redu private SAMFileWriter writerToUse = null; /** - * The number of bases to keep around mismatches (potential variation) + * */ - @Argument(fullName = "context_size", shortName = "cs", doc = "", required = false) + @Argument(fullName = "context_size", shortName = "cs", doc = "The number of bases to keep around mismatches (potential variation)", required = false) public int contextSize = 10; /** - * The minimum mapping quality to be considered for the consensus synthetic read. Reads that have + * Reads that have * mapping quality below this threshold will not be counted towards consensus, but are still counted * towards variable regions. */ - @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false) + @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "The minimum mapping quality to be considered for the consensus synthetic read", required = false) public int minMappingQuality = 20; /** - * The minimum base quality to be considered for the consensus synthetic read. Reads that have + * Reads that have * base quality below this threshold will not be counted towards consensus, but are still counted * towards variable regions. */ - @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false) + @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "The minimum base quality to be considered for the consensus synthetic read", required = false) public byte minBaseQual = 15; /** @@ -160,81 +160,77 @@ public class ReduceReads extends ReadWalker, Redu public List> known = Collections.emptyList(); /** - * Do not simplify read (strip away all extra information of the read -- anything other than bases, quals - * and read group). + * This strips away all extra information of the read -- anything other than bases, quals + * and read group. 
*/ - @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false) + @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "Do not simplify read", required = false) public boolean DONT_SIMPLIFY_READS = false; /** - * Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired. - * The program will behave correctly in those cases. + * Note that it is not necessary to turn this on for reads that are not mate paired. + * The program will behave correctly by default in those cases. */ - @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false) + @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "Do not hard clip adaptor sequences", required = false) public boolean DONT_CLIP_ADAPTOR_SEQUENCES = false; /** - * Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail + * This option overrides the argument of minimum tail * quality. */ - @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false) + @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "Do not hard clip the low quality tails of the reads", required = false) public boolean DONT_CLIP_LOW_QUAL_TAILS = false; /** - * Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped + * By default, ReduceReads will hard clip away any low quality soft clipped * base left by the aligner and use the high quality soft clipped bases in it's traversal algorithm to identify variant * regions. 
The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual) */ - @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false) + @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "Do not use high quality soft-clipped bases", required = false) public boolean DONT_USE_SOFTCLIPPED_BASES = false; /** - * Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee + * By default, ReduceReads will compress read names to numbers and guarantee * uniqueness and reads with similar name will still have similar compressed names. Note: If you scatter/gather * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing. */ - @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false) + @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "Do not compress read names", required = false) public boolean DONT_COMPRESS_READ_NAMES = false; /** - * Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval - * border. + * The hard clips will happen exactly at the interval border. */ - @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false) + @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "Hard clip all incoming reads to the desired intervals", required = false) public boolean HARD_CLIP_TO_INTERVAL = false; /** - * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be + * Anything below this will be * considered consensus and reduced (otherwise we will try to trigger polyploid compression). Note that * this value is used only regions with high coverage. 
*/ @Advanced - @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false) + @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "Minimum proportion of mismatches in a site to trigger a variant region", required = false) public double minAltProportionToTriggerVariant = 0.05; /** - * Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region. * Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to * trigger polyploid compression). Note that this value is used only regions with low coverage. */ @Advanced - @Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "", required = false) + @Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region", required = false) public double minAltPValueToTriggerVariant = 0.01; /** - * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be - * considered consensus. + * Anything below this will be considered consensus. */ - @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false) + @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "Minimum proportion of indels in a site to trigger a variant region", required = false) public double minIndelProportionToTriggerVariant = 0.05; /** - * The number of reads emitted per sample in a variant region can be downsampled for better compression. * This level of downsampling only happens after the region has been evaluated, therefore it can * be combined with the engine level downsampling. * A value of 0 turns downsampling off. 
*/ - @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false) + @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "Downsample the number of reads emitted per sample in a variant region for better compression", required = false) public int downsampleCoverage = 250; /** @@ -243,27 +239,27 @@ public class ReduceReads extends ReadWalker, Redu * To prevent users from unintentionally running the tool in a less than ideal manner, we require them * to explicitly enable multi-sample analysis with this argument. */ - @Argument(fullName = "cancer_mode", shortName = "cancer_mode", doc = "enable multi-samples reduction for cancer analysis", required = false) + @Argument(fullName = "cancer_mode", shortName = "cancer_mode", doc = "Enable multi-sample reduction for cancer analysis", required = false) public boolean ALLOW_MULTIPLE_SAMPLES = false; @Hidden - @Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false) + @Argument(fullName = "nwayout", shortName = "nw", doc = "Generate separate output files per input file", required = false) public boolean nwayout = false; @Hidden - @Argument(fullName = "", shortName = "dl", doc = "", required = false) + @Argument(fullName = "", shortName = "dl", doc = "Debug level", required = false) public int debugLevel = 0; @Hidden - @Argument(fullName = "", shortName = "dr", doc = "", required = false) + @Argument(fullName = "", shortName = "dr", doc = "Debug read", required = false) public String debugRead = ""; @Hidden - @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false) + @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "Downsampling strategy", required = false) public DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal; @Hidden - @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) + @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="Discard program tags", required = 
false) public boolean NO_PG_TAG = false; public enum DownsampleStrategy { @@ -297,7 +293,7 @@ public class ReduceReads extends ReadWalker, Redu throw new UserException.MissingArgument("out", "the output must be provided and is optional only for certain debugging modes"); if ( nwayout && out != null ) - throw new UserException.CommandLineException("--out and --nwayout can not be used simultaneously; please use one or the other"); + throw new UserException.CommandLineException("--out and --nwayout cannot be used simultaneously; please use one or the other"); if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 ) throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)"); @@ -306,7 +302,7 @@ public class ReduceReads extends ReadWalker, Redu throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)"); if ( SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()).size() > 1 && !ALLOW_MULTIPLE_SAMPLES ) - throw new UserException.BadInput("Reduce Reads is not meant to be run for more than 1 sample at a time except for the specific case of tumor/normal pairs in cancer analysis"); + throw new UserException.BadInput("Reduce Reads is not meant to be run for more than 1 sample at a time except for the specific case of tumor/normal pairs in cancer analysis. 
If that is what you want to do, use the -cancer_mode flag."); if ( known.isEmpty() ) knownSnpPositions = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java index 86676ca54..42e3ae0c0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java @@ -44,10 +44,11 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.PrintStream; /** - * Computes the read error rate per position in read (in the original 5'->3' orientation that the read had coming off the machine) + * Compute the read error rate per position * - * Emits a GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate for each read - * group in the input BAMs FOR ONLY THE FIRST OF PAIR READS. + *

This tool computes the read error rate per position in sequence reads. It does this in the original 5'->3' + * orientation that the read had coming off the machine. It then emits a GATKReport containing readgroup, cycle, + * mismatches, counts, qual, and error rate for each read group in the input BAMs.

* *

Input

*

@@ -56,9 +57,9 @@ import java.io.PrintStream; * *

Output

*

- * GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate. + * A GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate. * - * For example, running this tool on the NA12878 data sets: + * For example, running this tool on the NA12878 data sets yields the following table: * *

  *      ##:GATKReport.v0.2 ErrorRatePerCycle : The error rate per sequenced position in the reads
@@ -82,16 +83,20 @@ import java.io.PrintStream;
  *      
*

* - *

Examples

+ *

Example

*
  *    java
  *      -jar GenomeAnalysisTK.jar
  *      -T ErrorRatePerCycle
- *      -I bundle/current/b37/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam
- *      -R bundle/current/b37/human_g1k_v37.fasta
- *      -o example.gatkreport.txt
+ *      -R human_g1k_v37.fasta
+ *      -I my_sequence_reads.bam
+ *      -o error_rates.gatkreport.txt
  *  
* + *

Caveat

+ * + *

Note that when it is run on paired-end sequence data, this tool only uses the first read in a pair.

+ * * @author Kiran Garimella, Mark DePristo */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) From e40a07bb586752f23da345d3f0be69357fb8289b Mon Sep 17 00:00:00 2001 From: bradtaylor Date: Mon, 21 Oct 2013 14:08:03 -0400 Subject: [PATCH 48/77] Improve the PairHMM API for better FPGA integration Motivation: The API was different between the regular PairHMM and the FPGA-implementation via CnyPairHMM. As a result, the LikelihoodCalculationEngine had to use account for this. The goal is to change the API to be the same for all implementations, and make it easier to access. PairHMM PairHMM now accepts a list of reads and a map of alleles/haplotpes and returns a PerReadAlleleLikelihoodMap. Added a new primary method that loops the reads and haplotypes, extracts qualities, and passes them to the computeReadLikelihoodGivenHaplotypeLog10 method. Did not alter that method, or its subcompute method, at all. PairHMM also now handles its own (re)initialization, so users don't have to worry about that. CnyPairHMM Added that same new primary access method to this FPGA class. Method overrides the default implementation in PairHMM. Walks through a list of reads. Individual-read quals and the full haplotype list are fed to batchAdd(), as before. However, instead of waiting for every read to get added, and then walking through the reads again to extract results, we just get the haplotype-results array for each read as soon as it is generated, and pack it into a perReadAlleleLikelihoodMap for return. The main access method is now the same no matter whether the FPGA CnyPairHMM is used or not. LikelihoodCalculationEngine The functionality to loop through the reads and haplotypes and get individual log10-likelihoods was moved to the PairHMM, and so removed from here. However, this class does need to retain the ability to pre-process the reads, and post-process the resulting likelihoods map. 
Those features were separated from running the HMM and refactored into their own methods Commented out the (unused) system for finding best N haplotypes for genotyping. PairHMMIndelErrorModel Similar changes were made as to the LCE. However, in this case the haplotypes are modified based on each individual read, so the read-list we feed into the HMM only has one read. --- .../LikelihoodCalculationEngine.java | 546 +++++++++--------- .../indels/PairHMMIndelErrorModel.java | 150 +++-- .../utils/pairhmm/ArrayLoglessPairHMM.java | 1 + .../sting/utils/pairhmm/CnyPairHMM.java | 67 ++- .../sting/utils/pairhmm/LoglessPairHMM.java | 11 - .../genotyper/PerReadAlleleLikelihoodMap.java | 25 +- .../sting/utils/pairhmm/Log10PairHMM.java | 3 - .../sting/utils/pairhmm/N2MemoryPairHMM.java | 7 +- .../sting/utils/pairhmm/PairHMM.java | 87 ++- .../sting/utils/sam/GATKSAMRecord.java | 51 +- 10 files changed, 547 insertions(+), 401 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 0d55797bc..4eb728390 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -52,18 +52,10 @@ import net.sf.samtools.SAMUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; -import 
org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator; -import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; -import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; -import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; -import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM; -import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM; -import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.pairhmm.*; import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -88,7 +80,7 @@ public class LikelihoodCalculationEngine { private final PairHMM.HMM_IMPLEMENTATION hmmType; private final boolean noFpga; - private final ThreadLocal pairHMM = new ThreadLocal() { + private final ThreadLocal pairHMMThreadLocal = new ThreadLocal() { @Override protected PairHMM initialValue() { switch (hmmType) { @@ -109,6 +101,8 @@ public class LikelihoodCalculationEngine { } } }; +// Attempted to do as below, to avoid calling pairHMMThreadLocal.get() later on, but it resulted in a NullPointerException +// private final PairHMM pairHMM = pairHMMThreadLocal.get(); private final static boolean WRITE_LIKELIHOODS_TO_FILE = false; private final static String LIKELIHOODS_FILENAME = "likelihoods.txt"; @@ -173,36 +167,145 @@ public class LikelihoodCalculationEngine { if ( likelihoodsStream != null ) likelihoodsStream.close(); } - /** - * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate - * - * After calling this routine the PairHMM will be configured to best evaluate all reads in the samples - * against the set of haplotypes - * - * @param haplotypes a non-null list of haplotypes - * @param perSampleReadList a mapping from sample -> reads - */ - private void initializePairHMM(final List haplotypes, final 
Map> perSampleReadList) { - int X_METRIC_LENGTH = 0; - for( final Map.Entry> sample : perSampleReadList.entrySet() ) { - for( final GATKSAMRecord read : sample.getValue() ) { - final int readLength = read.getReadLength(); - if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; } - } + private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){ + if ( WRITE_LIKELIHOODS_TO_FILE ) { + likelihoodsStream.printf("%s %s %s %s %s %s %f%n", + haplotype.getBaseString(), + new String(processedRead.getReadBases() ), + SAMUtils.phredToFastq(processedRead.getBaseQualities() ), + SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ), + SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ), + SAMUtils.phredToFastq(constantGCP), + log10l); } - int Y_METRIC_LENGTH = 0; - for( final Haplotype h : haplotypes ) { - final int haplotypeLength = h.getBases().length; - if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; } - } - - // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases - pairHMM.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); } + private Map createAlleleMap(List haplotypes){ + final int numHaplotypes = haplotypes.size(); + final Map alleleMap = new LinkedHashMap<>(numHaplotypes); + for ( final Haplotype haplotype : haplotypes ) { + final Allele allele = Allele.create(haplotype, true); + alleleMap.put(allele, haplotype); + } + return alleleMap; + } + + private Map fillGCPArrays(List reads){ + final Map GCPArrayMap = new LinkedHashMap<>(); + for (GATKSAMRecord read: reads){ + byte [] GCPArray = new byte[read.getReadBases().length]; + Arrays.fill( GCPArray, constantGCP ); // Is there a way to derive empirical estimates for this from the data? 
+ GCPArrayMap.put(read, GCPArray); + } + return GCPArrayMap; + } + + private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) { + for( int kkk = 0; kkk < readQuals.length; kkk++ ) { + readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG + readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); + readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] ); + readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] ); + } + } + + /** + * Pre-processing of the reads to be evaluated at the current location from the current sample. + * We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read. + * Modified copies of reads are packed into a new list, while original reads are retained for downstream use + * + * @param reads The original list of unmodified reads + * @return processedReads. 
A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding + */ + private List modifyReadQualities(final List reads) { + List processedReads = new LinkedList<>(); + for ( GATKSAMRecord read : reads ) { + + final byte[] readBases = read.getReadBases(); + + // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read + final byte[] readQuals = read.getBaseQualities().clone(); + final byte[] readInsQuals = read.getBaseInsertionQualities().clone(); + final byte[] readDelQuals = read.getBaseDeletionQualities().clone(); + + applyPCRErrorModel(readBases, readInsQuals, readDelQuals); + capMinimumReadQualities(read, readQuals, readInsQuals, readDelQuals); + + // Create a new copy of the read and sets its base qualities to the modified versions. + // Pack this into a new list for return + final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals); + processedReads.add(processedRead); + } + return processedReads; + } + + /** + * Post-processing of the read/allele likelihoods. + * + * We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads. + * We wish to return a map containing the original, unmodified reads. + * + * At the same time, we want to effectively set a lower cap on the reference score, based on the global mis-mapping rate. + * This protects us from the case where the assembly has produced haplotypes + * that are very divergent from reference, but are supported by only one read. In effect + * we capping how badly scoring the reference can be for any read by the chance that the read + * itself just doesn't belong here + * + * @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. 
Contains the processed reads, the haplotype Alleles, and their log10ls + * @param reads Our original, unmodified reads + * @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls + * @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype + * + * @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls + */ + private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List reads, List processedReads, Map alleleHaplotypeMap){ + + // a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls + final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); + + Allele refAllele = null; + final int numReads = reads.size(); + for (int readIndex = 0; readIndex < numReads; readIndex++) { + + // Get the original and quality-modified read from their respective lists + // Note that this requires both lists to have reads in the same order + final GATKSAMRecord originalRead = reads.get(readIndex); + final GATKSAMRecord processedRead = processedReads.get(readIndex); + + // keep track of the reference likelihood and the best non-ref likelihood + double refLog10l = Double.NEGATIVE_INFINITY; + double bestNonReflog10L = Double.NEGATIVE_INFINITY; + + for ( Allele allele : alleleHaplotypeMap.keySet() ) { + final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele); + final Haplotype haplotype = alleleHaplotypeMap.get(allele); + if ( haplotype.isNonReference() ) + bestNonReflog10L = Math.max(bestNonReflog10L, log10l); + else { + refAllele = allele; + refLog10l = log10l; + } + writeDebugLikelihoods(processedRead, haplotype, log10l); + + // add the ORIGINAL (non-capped) 
read to the final map, along with the current haplotype and associated log10l + processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l); + } + + // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global + // mismapping rate. This protects us from the case where the assembly has produced haplotypes + // that are very divergent from reference, but are supported by only one read. In effect + // we capping how badly scoring the reference can be for any read by the chance that the read + // itself just doesn't belong here + final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; + if ( refLog10l < (worstRefLog10Allowed) ) { + processedReadAlleleLikelihoodMap.add(originalRead, refAllele, worstRefLog10Allowed); + } + } + return processedReadAlleleLikelihoodMap; + } + + public Map computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) { - // configure the HMM - initializePairHMM(haplotypes, perSampleReadList); // Add likelihoods for each sample's reads to our stratifiedReadMap final Map stratifiedReadMap = new LinkedHashMap<>(); @@ -218,137 +321,22 @@ public class LikelihoodCalculationEngine { } private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List haplotypes, final List reads) { - // first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time) - final BatchPairHMM batchPairHMM = (pairHMM.get() instanceof BatchPairHMM) ? 
(BatchPairHMM)pairHMM.get() : null; - final Vector batchedReads = new Vector(reads.size()); - final int numHaplotypes = haplotypes.size(); - final Map alleleVersions = new LinkedHashMap<>(numHaplotypes); - Allele refAllele = null; - for ( final Haplotype haplotype : haplotypes ) { - final Allele allele = Allele.create(haplotype, true); - alleleVersions.put(haplotype, allele); - if ( haplotype.isReference() ) refAllele = allele; - } - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - for( final GATKSAMRecord read : reads ) { + // Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities + List processedReads = modifyReadQualities(reads); - final byte[] readBases = read.getReadBases(); - final byte[] overallGCP = new byte[read.getReadLength()]; - Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? + // Get alleles corresponding to our haplotypees + Map alleleHaplotypeMap = createAlleleMap(haplotypes); - // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read - final byte[] readQuals = read.getBaseQualities().clone(); - final byte[] readInsQuals = read.getBaseInsertionQualities().clone(); - final byte[] readDelQuals = read.getBaseDeletionQualities().clone(); + // Get an array containing the constantGCP for each read in our modified read list + Map GCPArrayMap = fillGCPArrays(processedReads); - applyPCRErrorModel(readBases, readInsQuals, readDelQuals); + // Run the PairHMM to calculate the log10 likelihood of each (processed) reads' arising from each haplotype + PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap); - for( int kkk = 0; kkk < readQuals.length; kkk++ ) { - readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base 
quality by mapping quality, as in UG - readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); - readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] ); - readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] ); - } + // Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate - if ( batchPairHMM != null ) { - batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); - batchedReads.add(read); - continue; - } - - // keep track of the reference likelihood and the best non-ref likelihood - double refLog10l = Double.NEGATIVE_INFINITY; - double bestNonReflog10L = Double.NEGATIVE_INFINITY; - - // iterate over all haplotypes, calculating the likelihood of the read for each haplotype - for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - final Haplotype haplotype = haplotypes.get(jjj); - final byte[] nextHaplotypeBases = (jjj == numHaplotypes - 1) ? 
null : haplotypes.get(jjj+1).getBases(); - final boolean isFirstHaplotype = jjj == 0; - final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases); - - if ( WRITE_LIKELIHOODS_TO_FILE ) { - likelihoodsStream.printf("%s %s %s %s %s %s %f%n", - haplotype.getBaseString(), - new String(readBases), - SAMUtils.phredToFastq(readQuals), - SAMUtils.phredToFastq(readInsQuals), - SAMUtils.phredToFastq(readDelQuals), - SAMUtils.phredToFastq(overallGCP), - log10l); - } - - if ( haplotype.isNonReference() ) - bestNonReflog10L = Math.max(bestNonReflog10L, log10l); - else - refLog10l = log10l; - - perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); - } - - // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global - // mismapping rate. This protects us from the case where the assembly has produced haplotypes - // that are very divergent from reference, but are supported by only one read. 
In effect - // we capping how badly scoring the reference can be for any read by the chance that the read - // itself just doesn't belong here - final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; - if ( refLog10l < (worstRefLog10Allowed) ) { - perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed); - } - } - - if ( batchPairHMM != null ) { - for( final GATKSAMRecord read : batchedReads ) { - double refLog10l = Double.NEGATIVE_INFINITY; - double bestNonReflog10L = Double.NEGATIVE_INFINITY; - final double[] likelihoods = batchPairHMM.batchGetResult(); - for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - final Haplotype haplotype = haplotypes.get(jjj); - final double log10l = likelihoods[jjj]; - - if ( WRITE_LIKELIHOODS_TO_FILE ) { - final byte[] overallGCP = new byte[read.getReadLength()]; - Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? - // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read - final byte[] readQuals = read.getBaseQualities().clone(); - final byte[] readInsQuals = read.getBaseInsertionQualities(); - final byte[] readDelQuals = read.getBaseDeletionQualities(); - for( int kkk = 0; kkk < readQuals.length; kkk++ ) { - readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG - //readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated - //readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated - // TODO -- why is Q18 hard-coded here??? - readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? 
QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); - } - likelihoodsStream.printf("%s %s %s %s %s %s %f%n", - haplotype.getBaseString(), - new String(read.getReadBases()), - SAMUtils.phredToFastq(readQuals), - SAMUtils.phredToFastq(readInsQuals), - SAMUtils.phredToFastq(readDelQuals), - SAMUtils.phredToFastq(overallGCP), - log10l); - } - - if ( haplotype.isNonReference() ) - bestNonReflog10L = Math.max(bestNonReflog10L, log10l); - else - refLog10l = log10l; - - - perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); - } - - final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; - if ( refLog10l < (worstRefLog10Allowed) ) { - perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed); - } - } - } - - return perReadAlleleLikelihoodMap; + return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap); } @Requires({"alleleOrdering.size() > 0"}) @@ -421,125 +409,125 @@ public class LikelihoodCalculationEngine { // System to compute the best N haplotypes for genotyping // // -------------------------------------------------------------------------------- - - /** - * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele - * @param map an annoying map object that moves us between the allele and haplotype representation - * @param haplotypeAsAllele the allele version of the haplotype - * @return the haplotype version, with its score incremented by 1 if its non-reference - */ - private Haplotype updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { - final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic - if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value - return h; - } - - /** - * Take the best N haplotypes and return them as a list - * - * Only considers the haplotypes selectedHaplotypes that were actually 
selected by at least one sample - * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing - * order of score (so higher score haplotypes are preferred). The N we take is determined by - * - * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) - * - * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is - * bounded by maxNumHaplotypesInPopulation as that number can grow without bound - * - * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 - * @param nSamples the number of samples used to select the haplotypes - * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples - * @return a list of N or fewer haplotypes, with the reference haplotype first - */ - private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { - final List selectedHaplotypesList = new ArrayList(selectedHaplotypes); - Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); - final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; - final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); - final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); - if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); - return bestHaplotypes; - } - - /** - * Select the best haplotypes for genotyping the samples in stratifiedReadMap - * - * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely - * haplotypes per sample. 
What this means is that each sample computes the diploid genotype likelihoods for - * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get - * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation - * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the - * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. - * - * @param haplotypes a list of all haplotypes we're considering - * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype - * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes - * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation - */ - public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { - if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); - - if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes - - // all of the haplotypes that at least one sample called as one of the most likely - final Set selectedHaplotypes = new HashSet<>(); - selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected - - // our annoying map from allele -> haplotype - final Map allele2Haplotype = new HashMap<>(); - for ( final Haplotype h : haplotypes ) { - h.setScore(h.isReference() ? 
Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes - allele2Haplotype.put(Allele.create(h, h.isReference()), h); - } - - // for each sample, compute the most likely pair of haplotypes - for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { - // get the two most likely haplotypes under a diploid model for this sample - final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); - - if ( mla != null ) { // there was something to evaluate in this sample - // note that there must be at least 2 haplotypes - final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); - final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); - -// if ( DEBUG ) { -// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); +// +// /** +// * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele +// * @param map an annoying map object that moves us between the allele and haplotype representation +// * @param haplotypeAsAllele the allele version of the haplotype +// * @return the haplotype version, with its score incremented by 1 if its non-reference +// */ +// private Haplotype updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { +// final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic +// if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value +// return h; +// } +// +// /** +// * Take the best N haplotypes and return them as a list +// * +// * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample +// * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing +// * order of score (so higher score haplotypes are preferred). 
The N we take is determined by +// * +// * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) +// * +// * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is +// * bounded by maxNumHaplotypesInPopulation as that number can grow without bound +// * +// * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 +// * @param nSamples the number of samples used to select the haplotypes +// * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples +// * @return a list of N or fewer haplotypes, with the reference haplotype first +// */ +// private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { +// final List selectedHaplotypesList = new ArrayList<>(selectedHaplotypes); +// Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); +// final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; +// final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); +// final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); +// if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); +// return bestHaplotypes; +// } +// +// /** +// * Select the best haplotypes for genotyping the samples in stratifiedReadMap +// * +// * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely +// * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for +// * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get +// * one extra count for each haplotype (so hom-var haplotypes get two counts). 
After performing this calculation +// * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the +// * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. +// * +// * @param haplotypes a list of all haplotypes we're considering +// * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype +// * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes +// * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation +// */ +// public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { +// if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); +// +// if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes +// +// // all of the haplotypes that at least one sample called as one of the most likely +// final Set selectedHaplotypes = new HashSet<>(); +// selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected +// +// // our annoying map from allele -> haplotype +// final Map allele2Haplotype = new HashMap<>(); +// for ( final Haplotype h : haplotypes ) { +// h.setScore(h.isReference() ? 
Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes +// allele2Haplotype.put(Allele.create(h, h.isReference()), h); +// } +// +// // for each sample, compute the most likely pair of haplotypes +// for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { +// // get the two most likely haplotypes under a diploid model for this sample +// final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); +// +// if ( mla != null ) { // there was something to evaluate in this sample +// // note that there must be at least 2 haplotypes +// final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); +// final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); +// +//// if ( DEBUG ) { +//// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); +//// } +// +// // add these two haplotypes to the set of haplotypes that have been selected +// selectedHaplotypes.add(best); +// selectedHaplotypes.add(second); +// +// // we've already selected all of our haplotypes, and we don't need to prune them down +// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) +// break; // } - - // add these two haplotypes to the set of haplotypes that have been selected - selectedHaplotypes.add(best); - selectedHaplotypes.add(second); - - // we've already selected all of our haplotypes, and we don't need to prune them down - if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) - break; - } - } - - // take the best N haplotypes forward, in order of the number of samples that choose them - final int nSamples = stratifiedReadMap.size(); - final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); - - if ( 
DEBUG ) { - logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); - for ( final Haplotype h : bestHaplotypes ) { - logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); - } - } - return bestHaplotypes; - } - - /** - * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found - * @param haplotypes non-null list of haplotypes - * @return the reference haplotype - */ - private static Haplotype findReferenceHaplotype( final List haplotypes ) { - for( final Haplotype h : haplotypes ) { - if( h.isReference() ) return h; - } - throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); - } +// } +// +// // take the best N haplotypes forward, in order of the number of samples that choose them +// final int nSamples = stratifiedReadMap.size(); +// final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); +// +// if ( DEBUG ) { +// logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); +// for ( final Haplotype h : bestHaplotypes ) { +// logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); +// } +// } +// return bestHaplotypes; +// } +// +// /** +// * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found +// * @param haplotypes non-null list of haplotypes +// * @return the reference haplotype +// */ +// private static Haplotype findReferenceHaplotype( final List haplotypes ) { +// for( final Haplotype h : haplotypes ) { +// if( h.isReference() ) return h; +// } +// throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" 
); +// } // -------------------------------------------------------------------------------- // diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 3c6e409b9..97431368e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -48,11 +48,11 @@ package org.broadinstitute.sting.gatk.walkers.indels; import com.google.java.contract.Ensures; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; @@ -65,6 +65,7 @@ import org.broadinstitute.variant.variantcontext.Allele; import java.util.Arrays; import java.util.LinkedHashMap; +import java.util.LinkedList; import java.util.Map; //import org.broadinstitute.sting.utils.pairhmm.LoglessCachingPairHMM; @@ -206,6 +207,39 @@ public class PairHMMIndelErrorModel { } } + private LinkedHashMap trimHaplotypes(final LinkedHashMap haplotypeMap, + long startLocationInRefForHaplotypes, + long stopLocationInRefForHaplotypes, + final ReferenceContext ref){ + + final LinkedHashMap trimmedHaplotypeMap = new LinkedHashMap<>(); + for (final Allele a: haplotypeMap.keySet()) { + + final Haplotype haplotype = haplotypeMap.get(a); + + if (stopLocationInRefForHaplotypes > 
haplotype.getStopPosition()) + stopLocationInRefForHaplotypes = haplotype.getStopPosition(); + + if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) + startLocationInRefForHaplotypes = haplotype.getStartPosition(); + else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) + startLocationInRefForHaplotypes = haplotype.getStopPosition(); + + final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); + final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); + + if (DEBUG) + System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n", + indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); + + // get the trimmed haplotype-bases array and create a new haplotype based on it. Pack this into the new map + final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); + final Haplotype trimmedHaplotype = new Haplotype(trimmedHaplotypeBases, haplotype.isReference()); + trimmedHaplotypeMap.put(a, trimmedHaplotype); + } + return trimmedHaplotypeMap; + } + public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup, final LinkedHashMap haplotypeMap, @@ -231,6 +265,8 @@ public class PairHMMIndelErrorModel { final int[] readCounts) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; + final LinkedList readList = new LinkedList<>(); + final Map readGCPArrayMap = new LinkedHashMap<>(); int readIdx=0; for (PileupElement p: pileup) { // > 1 when the read is a consensus read representing multiple independent observations @@ -369,86 +405,30 @@ public class PairHMMIndelErrorModel { baseDeletionQualities = contextLogGapOpenProbabilities; } - byte[] currentHaplotypeBases = null; - boolean firstHap = true; - double readLikelihood; - Allele currentAllele = null; - 
for (Allele a: haplotypeMap.keySet()) { + // Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM + final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities); + readList.add(processedRead); - Haplotype haplotype = haplotypeMap.get(a); + // Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM + readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities); - if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) - stopLocationInRefForHaplotypes = haplotype.getStopPosition(); + // Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the approprate genomic locations + final Map trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref); - if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) - startLocationInRefForHaplotypes = haplotype.getStartPosition(); - else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) - startLocationInRefForHaplotypes = haplotype.getStopPosition(); + // Get the likelihoods for our clipped read against each of our trimmed haplotypes. 
+ final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap); - final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); - final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); - - - if (DEBUG) - System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", - indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); - - // peak at the next haplotype in the list - final byte[] nextHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - // process the current haplotype in the list - if (currentHaplotypeBases != null) { - // it's possible that the indel starts at the last base of the haplotypes - if ( currentHaplotypeBases.length == 0 ) { - readLikelihood = -Double.MAX_VALUE; - } else { - if (firstHap) { - //no need to reallocate arrays for each new haplotype, as length won't change - pairHMM.initialize(readBases.length, currentHaplotypeBases.length); - firstHap = false; - } - - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals, - baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, nextHaplotypeBases); - } - - if (DEBUG) { - System.out.println("H:"+new String(currentHaplotypeBases)); - System.out.println("R:"+new String(readBases)); - System.out.format("L:%4.2f\n",readLikelihood); - } - - perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood); - readLikelihoods[readIdx][j++] = readLikelihood; - } - // update the current haplotype - currentHaplotypeBases = nextHaplotypeBases; - currentAllele = a; - } - // process the final haplotype - if (currentHaplotypeBases != null) { - // it's possible that the indel starts at 
the last base of the haplotypes - if ( currentHaplotypeBases.length == 0 ) { - readLikelihood = -Double.MAX_VALUE; - } else { - if (firstHap) { - //no need to reallocate arrays for each new haplotype, as length won't change - pairHMM.initialize(readBases.length, currentHaplotypeBases.length); - firstHap = false; - } - // there is no next haplotype, so pass null for nextHaplotypeBases. - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals, - baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, null); - } - - if (DEBUG) { - System.out.println("H:"+new String(currentHaplotypeBases)); - System.out.println("R:"+new String(readBases)); - System.out.format("L:%4.2f\n",readLikelihood); - } - - perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood); + // Pack the original pilup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array + for (Allele a: trimmedHaplotypeMap.keySet()){ + double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a); + perReadAlleleLikelihoodMap.add(p, a, readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; } + // The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually + readList.remove(processedRead); + + // The same is true for the read/GCP-array map + readGCPArrayMap.remove(processedRead); } } readIdx++; @@ -472,16 +452,16 @@ public class PairHMMIndelErrorModel { return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)); } - private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { - if (b1.length != b2.length) - return 0; // sanity check - - for (int i=0; i < b1.length; i++ ){ - if ( b1[i]!= b2[i] ) - return 
i; - } - return b1.length; - } +// private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { +// if (b1.length != b2.length) +// return 0; // sanity check +// +// for (int i=0; i < b1.length; i++ ){ +// if ( b1[i]!= b2[i] ) +// return i; +// } +// return b1.length; +// } private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java index 4b996e770..a693ec22d 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java @@ -49,6 +49,7 @@ package org.broadinstitute.sting.utils.pairhmm; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.QualityUtils; + import java.util.Arrays; /** diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java index d92b918ba..b80036bb2 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java @@ -47,12 +47,16 @@ package org.broadinstitute.sting.utils.pairhmm; import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; import java.io.File; import java.lang.reflect.Field; import java.util.Arrays; import java.util.LinkedList; import java.util.List; +import java.util.Map; public final class CnyPairHMM extends 
PairHMM implements BatchPairHMM { private static class HmmInput { @@ -62,14 +66,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { public byte[] deletionGOP; public byte[] overallGCP; public List haplotypes; - }; + } public static class ResultQueue { private int offset; private List batchResults; public ResultQueue() { - batchResults = new LinkedList(); + batchResults = new LinkedList<>(); offset = 0; } @@ -92,7 +96,7 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { final static String libName = "gmvhdl_gatk_hmm"; private static boolean loaded = false; - private List batchRequests = new LinkedList(); + private List batchRequests = new LinkedList<>(); private ResultQueue resultQueue = new ResultQueue(); static public boolean isAvailable() { @@ -184,6 +188,55 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { return results; } + /** + * {@inheritDoc} + */ + @Override + public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap){ + + // initialize the pairHMM if necessary + if (! 
initialized) { + int readMaxLength = findMaxReadLength(reads); + int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + initialize(readMaxLength, haplotypeMaxLength); + } + + // Pass the read bases/quals, and the haplotypes as a list into the HMM + performBatchAdditions(reads, alleleHaplotypeMap, GCPArrayMap); + + // Get the log10-likelihoods for each read/haplotype ant pack into the results map + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + collectLikelihoodResults(reads, alleleHaplotypeMap, likelihoodMap); + + return likelihoodMap; + } + + private void collectLikelihoodResults(List reads, Map alleleHaplotypeMap, PerReadAlleleLikelihoodMap likelihoodMap) { + for(final GATKSAMRecord read : reads){ + final double[] likelihoods = batchGetResult(); + int jjj = 0; + for (Allele allele : alleleHaplotypeMap.keySet()){ + final double log10l = likelihoods[jjj]; + likelihoodMap.add(read, allele, log10l); + jjj++; + } + } + } + + private void performBatchAdditions(List reads, Map alleleHaplotypeMap, Map GCPArrayMap) { + final List haplotypeList = getHaplotypeList(alleleHaplotypeMap); + for(final GATKSAMRecord read : reads){ + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] overallGCP = GCPArrayMap.get(read); + + batchAdd(haplotypeList, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); + } + } + + protected double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, @@ -196,6 +249,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { return 0.0; } + private List getHaplotypeList(Map alleleHaplotypeMap){ + final List haplotypeList = new LinkedList<>(); + for (Allele a : alleleHaplotypeMap.keySet()){ + 
haplotypeList.add(alleleHaplotypeMap.get(a)); + } + return haplotypeList; + } + private void enqueuePrepare(byte[] haplotypeBases, byte[] readBases) { double[] results = null; int n = dequeueRequirement(haplotypeBases.length, readBases.length); diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java index e745ca1f5..de2994927 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java @@ -70,17 +70,6 @@ public final class LoglessPairHMM extends N2MemoryPairHMM { private static final int deletionToDeletion = 5; - /** - * {@inheritDoc} - */ - @Override - public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { - super.initialize(readMaxLength, haplotypeMaxLength); - - transition = new double[paddedMaxReadLength][6]; - prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - } - /** * {@inheritDoc} */ diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 70be85f54..af22eb7f9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -110,7 +110,7 @@ public class PerReadAlleleLikelihoodMap { * @return a map from each allele to a list of reads that 'support' the allele */ protected Map> getAlleleStratifiedReadMap() { - final Map> alleleReadMap = new HashMap>(alleles.size()); + final Map> alleleReadMap = new HashMap<>(alleles.size()); for ( final Allele allele : alleles ) alleleReadMap.put(allele, new ArrayList()); @@ -152,7 +152,7 @@ public class PerReadAlleleLikelihoodMap { /** * Does the current map contain the key associated 
with a particular SAM record in pileup? * @param p Pileup element - * @return + * @return true if the map contains pileup element, else false */ public boolean containsPileupElement(final PileupElement p) { return likelihoodReadMap.containsKey(p.getRead()); @@ -176,9 +176,9 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.keySet(); } - public Collection> getLikelihoodMapValues() { - return likelihoodReadMap.values(); - } +// public Collection> getLikelihoodMapValues() { +// return likelihoodReadMap.values(); +// } public int getNumberOfStoredElements() { return likelihoodReadMap.size(); @@ -191,6 +191,21 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.get(p.getRead()); } + + /** + * Get the log10 likelihood associated with an individual read/allele + * + * @param read the read whose likelihood we want + * @param allele the allele whose likelihood we want + * @return the log10 likelihood that this read matches this allele + */ + public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){ + if (!allelesSet.contains(allele) || !likelihoodReadMap.containsKey(read)) + return 0.0; + + return likelihoodReadMap.get(read).get(allele); + } + /** * Get the most likely alleles estimated across all reads in this object * diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java index e7bc5cb56..83b87da95 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java @@ -85,9 +85,6 @@ public final class Log10PairHMM extends N2MemoryPairHMM { Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); } - - transition = new double[paddedMaxReadLength][6]; - prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; } /** diff 
--git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java index a091a0716..c02461c03 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java @@ -26,10 +26,6 @@ package org.broadinstitute.sting.utils.pairhmm; import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; - -import java.util.Arrays; /** * Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix @@ -58,6 +54,9 @@ abstract class N2MemoryPairHMM extends PairHMM { matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + + transition = new double[paddedMaxReadLength][6]; + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java index eb52f4a85..ce3d43a06 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -28,9 +28,14 @@ package org.broadinstitute.sting.utils.pairhmm; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; import java.util.Arrays; - +import java.util.List; +import java.util.Map; /** * Util class for 
performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. * @@ -59,7 +64,7 @@ public abstract class PairHMM { protected int maxHaplotypeLength, maxReadLength; protected int paddedMaxReadLength, paddedMaxHaplotypeLength; protected int paddedReadLength, paddedHaplotypeLength; - private boolean initialized = false; + protected boolean initialized = false; // only used for debugging purposes protected boolean doNotUseTristateCorrection = false; @@ -73,7 +78,7 @@ public abstract class PairHMM { * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM * @param readMaxLength the max length of reads we want to use with this PairHMM */ - public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { + protected void initialize( final int readMaxLength, final int haplotypeMaxLength ) { if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); @@ -90,6 +95,79 @@ public abstract class PairHMM { initialized = true; } + protected int findMaxReadLength(final List reads) { + int listMaxReadLength = 0; + for(GATKSAMRecord read : reads){ + final int readLength = read.getReadLength(); + if( readLength > listMaxReadLength ) { listMaxReadLength = readLength; } + } + return listMaxReadLength; + } + + protected int findMaxHaplotypeLength(final Map haplotypeMap) { + int listMaxHaplotypeLength = 0; + for( final Allele a: haplotypeMap.keySet() ) { + final Haplotype h = haplotypeMap.get(a); + final int haplotypeLength = h.getBases().length; + if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; } + } + return listMaxHaplotypeLength; + } + + /** + * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from + * each haplotype given base 
substitution, insertion, and deletion probabilities. + * + * @param reads the list of reads + * @param alleleHaplotypeMap the list of haplotypes + * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read. + * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of + * said read coming from the said haplotype under the provided error model + */ + public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { + + // (re)initialize the pairHMM only if necessary + final int readMaxLength = findMaxReadLength(reads); + final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); } + + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + for(GATKSAMRecord read : reads){ + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] overallGCP = GCPArrayMap.get(read); + + // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) + byte[] currentHaplotypeBases = null; + boolean isFirstHaplotype = true; + Allele currentAllele = null; + double log10l; + for (final Allele allele : alleleHaplotypeMap.keySet()){ + final Haplotype haplotype = alleleHaplotypeMap.get(allele); + final byte[] nextHaplotypeBases = haplotype.getBases(); + if (currentHaplotypeBases != null) { + log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, 
isFirstHaplotype, nextHaplotypeBases); + likelihoodMap.add(read, currentAllele, log10l); + } + // update the current haplotype + currentHaplotypeBases = nextHaplotypeBases; + currentAllele = allele; + } + // process the final haplotype + if (currentHaplotypeBases != null) { + + // there is no next haplotype, so pass null for nextHaplotypeBases. + log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null); + likelihoodMap.add(read, currentAllele, log10l); + } + } + return likelihoodMap; + } + /** * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion * probabilities. @@ -110,7 +188,7 @@ public abstract class PairHMM { * parameters are the same, and only the haplotype bases are changing underneath us * @return the log10 probability of read coming from the haplotype under the provided error model */ - public final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final byte[] insertionGOP, @@ -118,6 +196,7 @@ public abstract class PairHMM { final byte[] overallGCP, final boolean recacheReadValues, final byte[] nextHaploytpeBases) { + if ( ! 
initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c39245730..93718b04d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -139,7 +139,7 @@ public class GATKSAMRecord extends BAMRecord { } public static GATKSAMRecord createRandomRead(int length) { - List cigarElements = new LinkedList(); + List cigarElements = new LinkedList<>(); cigarElements.add(new CigarElement(length, CigarOperator.M)); Cigar cigar = new Cigar(cigarElements); return ArtificialSAMUtils.createArtificialRead(cigar); @@ -536,10 +536,7 @@ public class GATKSAMRecord extends BAMRecord { * @return True if an attribute has been set for this key. 
*/ public boolean containsTemporaryAttribute(Object key) { - if(temporaryAttributes != null) { - return temporaryAttributes.containsKey(key); - } - return false; + return temporaryAttributes != null && temporaryAttributes.containsKey(key); } /** @@ -556,7 +553,7 @@ public class GATKSAMRecord extends BAMRecord { */ public Object setTemporaryAttribute(Object key, Object value) { if(temporaryAttributes == null) { - temporaryAttributes = new HashMap(); + temporaryAttributes = new HashMap<>(); } return temporaryAttributes.put(key, value); } @@ -750,6 +747,46 @@ public class GATKSAMRecord extends BAMRecord { return emptyRead; } + /** + * Creates a new GATKSAMRecord with the source read's header, read group and mate + * information, but with the following fields set to user-supplied values: + * - Read Bases + * - Base Qualities + * - Base Insertion Qualities + * - Base Deletion Qualities + * + * Cigar string is empty (not-null) + * + * Use this method if you want to create a new GATKSAMRecord based on + * another GATKSAMRecord, but with modified bases and qualities + * + * @param read a read to copy the header from + * @param readBases an array containing the new bases you wish use in place of the originals + * @param baseQualities an array containing the new base qualities you wish use in place of the originals + * @param baseInsertionQualities an array containing the new base insertion qaulities + * @param baseDeletionQualities an array containing the new base deletion qualities + * @return a read with modified bases and qualities, safe for the GATK + */ + public static GATKSAMRecord createQualityModifiedRead(final GATKSAMRecord read, + final byte[] readBases, + final byte[] baseQualities, + final byte[] baseInsertionQualities, + final byte[] baseDeletionQualities) { + if ( baseQualities.length != readBases.length || baseInsertionQualities.length != readBases.length || baseDeletionQualities.length != readBases.length ) + throw new IllegalArgumentException("Read bases 
and read quality arrays aren't the same size: Bases:" + readBases.length + + " vs Base Q's:" + baseQualities.length + + " vs Insert Q's:" + baseInsertionQualities.length + + " vs Delete Q's:" + baseDeletionQualities.length); + + final GATKSAMRecord processedRead = GATKSAMRecord.emptyRead(read); + processedRead.setReadBases(readBases); + processedRead.setBaseQualities(baseQualities, EventType.BASE_SUBSTITUTION); + processedRead.setBaseQualities(baseInsertionQualities, EventType.BASE_INSERTION); + processedRead.setBaseQualities(baseDeletionQualities, EventType.BASE_DELETION); + + return processedRead; + } + /** * Shallow copy of everything, except for the attribute list and the temporary attributes. * A new list of the attributes is created for both, but the attributes themselves are copied by reference. @@ -762,7 +799,7 @@ public class GATKSAMRecord extends BAMRecord { public Object clone() throws CloneNotSupportedException { final GATKSAMRecord clone = (GATKSAMRecord) super.clone(); if (temporaryAttributes != null) { - clone.temporaryAttributes = new HashMap(); + clone.temporaryAttributes = new HashMap<>(); for (Object attribute : temporaryAttributes.keySet()) clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute)); } From 9c1023c933cb34a5368c58643659bfcbcdcc86f5 Mon Sep 17 00:00:00 2001 From: Ami Levy-Moonshine Date: Mon, 18 Nov 2013 11:44:24 -0500 Subject: [PATCH 50/77] fix a (ugly) weird error from last commit that changed all the scala files to end with MoleculoPipeline.scala --- .../sting/queue/extensions/gatk/GATKExtensionsGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java index bf675503b..ced0809f7 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java +++ 
b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java @@ -325,7 +325,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { * @throws IOException If the file cannot be written. */ private void writeFile(String fullClassName, String content) throws IOException { - File outputFile = new File(outputDirectory, fullClassName.replace(".", "/") + "MoleculoPipeline.scala"); + File outputFile = new File(outputDirectory, fullClassName.replace(".", "/") + ".scala"); if (outputFile.exists()) { String existingContent = FileUtils.readFileToString(outputFile); if (StringUtils.equals(content, existingContent)) From 6ad841cec575024fd186b9d0aeb7f8f69f49ee9d Mon Sep 17 00:00:00 2001 From: Ami Levy-Moonshine Date: Thu, 7 Nov 2013 14:54:35 -0500 Subject: [PATCH 51/77] Rewrite ReadLengthDistribution to count the read lengths into a hash table first and only at the end to produce a GATK report table. Before that fix, the tool was couldn't work with more then one RG before. - Address all review comments --- .../diagnostics/ReadLengthDistribution.java | 101 +++++++++++++----- 1 file changed, 77 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java index a269a94bc..796c817ff 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java @@ -38,7 +38,10 @@ import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.PrintStream; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.TreeMap; /** * Outputs the read lengths of all the reads in a file. 
@@ -77,51 +80,101 @@ public class ReadLengthDistribution extends ReadWalker { @Output public PrintStream out; - private GATKReport report; + //A map from RG to its column number (its index in an int[] array) + private Map readGroupsLocation; + //Each line in the table is a read length and each column it the number of reads of a specific RG with that length. Thus a table is a map between read lengths to array of values (one for each RG). + private Map table; + private List readGroups; public void initialize() { - final List readGroups = getToolkit().getSAMFileHeader().getReadGroups(); + readGroups = getToolkit().getSAMFileHeader().getReadGroups(); + readGroupsLocation = new HashMap<>(); + table = new TreeMap<>(); + int readGroupsNum = 0; - report = new GATKReport(); - report.addTable("ReadLengthDistribution", "Table of read length distributions", 1 + (readGroups.isEmpty() ? 1 : readGroups.size())); - GATKReportTable table = report.getTable("ReadLengthDistribution"); - - table.addColumn("readLength"); - - if (readGroups.isEmpty()) - table.addColumn("SINGLE_SAMPLE"); - else - for (SAMReadGroupRecord rg : readGroups) - table.addColumn(rg.getSample()); - } - - public boolean filter(ReferenceContext ref, GATKSAMRecord read) { - return ( !read.getReadPairedFlag() || read.getReadPairedFlag() && read.getFirstOfPairFlag()); + if (!readGroups.isEmpty()){ + for (SAMReadGroupRecord rg : readGroups){ + readGroupsLocation.put(rg,readGroupsNum); + readGroupsNum++; + } + } } @Override - public Integer map(ReferenceContext referenceContext, GATKSAMRecord samRecord, RefMetaDataTracker RefMetaDataTracker) { - GATKReportTable table = report.getTable("ReadLengthDistribution"); + public Integer map(final ReferenceContext referenceContext,final GATKSAMRecord samRecord,final RefMetaDataTracker RefMetaDataTracker) { - int length = Math.abs(samRecord.getReadLength()); - String sample = samRecord.getReadGroup().getSample(); + final int length = Math.abs(samRecord.getReadLength()); + final 
SAMReadGroupRecord rg = samRecord.getReadGroup(); - table.increment(length, sample); + increment(table,length, rg); return null; } + final private void increment(final Map table,final int length,final SAMReadGroupRecord rg){ + if(readGroupsLocation.isEmpty()){ + if(table.containsKey(length)) + table.get(length)[0]++; + else{ + final int[] newLength = {1}; + table.put(length,newLength); + } + } + else{ + final int rgLocation = readGroupsLocation.get(rg); + if(table.containsKey(length)) + table.get(length)[rgLocation]++; + else{ + table.put(length,new int[readGroupsLocation.size()]); + table.get(length)[rgLocation]++; + } + } + } + @Override public Integer reduceInit() { return null; } @Override - public Integer reduce(Integer integer, Integer integer1) { + public Integer reduce(final Integer integer,final Integer integer1) { return null; } - public void onTraversalDone(Integer sum) { + public void onTraversalDone(final Integer sum) { + final GATKReport report = createGATKReport(); report.print(out); } + + final private GATKReport createGATKReport(){ + final GATKReport report = new GATKReport(); + report.addTable("ReadLengthDistribution", "Table of read length distributions", 1 + (readGroupsLocation.isEmpty() ? 
1 : readGroupsLocation.size())); + final GATKReportTable tableReport = report.getTable("ReadLengthDistribution"); + + tableReport.addColumn("readLength"); + + if (readGroupsLocation.isEmpty()){ + tableReport.addColumn("SINGLE_SAMPLE"); + int rowIndex = 0; + for (Integer length : table.keySet()){ + tableReport.set(rowIndex,0,length); + tableReport.set(rowIndex,1,table.get(length)[0]); + rowIndex++; + } + } + else{ + for (SAMReadGroupRecord rg : readGroups) + tableReport.addColumn(rg.getSample()); + int rowIndex = 0; + for (Integer length : table.keySet()){ + tableReport.set(rowIndex,0,length); + for (int i=0; i < readGroupsLocation.size(); i++) + tableReport.set(rowIndex,i+1,table.get(length)[i]); + rowIndex++; + } + + } + + return report; + } } From e6ef37de1de3a1659f3c9a267f462d1fc70f46dc Mon Sep 17 00:00:00 2001 From: Ami Levy-Moonshine Date: Thu, 7 Nov 2013 17:16:14 -0500 Subject: [PATCH 52/77] Add an option to filter the read bases that are taking into account for the coveraged intervals. 
For that, new two arguments were added: minBaseQuality and minMappingQuality --- .../walkers/diagnostics/FindCoveredIntervals.java | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java index ad6023579..bd69cbdbd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java @@ -101,11 +101,24 @@ public class FindCoveredIntervals extends ActiveRegionWalker { @Argument(fullName = "coverage_threshold", shortName = "cov", doc = "The minimum allowable coverage to be considered covered", required = false) private int coverageThreshold = 20; + @Argument(fullName = "minBaseQuality", shortName = "minBQ", doc = "The minimum allowable base quality score to be counted for coverage",required = false) + private int minBaseQuality = 0; + + @Argument(fullName = "minMappingQuality", shortName = "minMQ", doc = "The minimum allowable mapping quality score to be counted for coverage",required = false) + private int minMappingQuality = 0; + + + + @Override // Look to see if the region has sufficient coverage public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { - int depth = context.getBasePileup().getBaseFilteredPileup(coverageThreshold).depthOfCoverage(); + int depth; + if(minBaseQuality == 0 && minMappingQuality == 0) + depth = context.getBasePileup().getBaseFilteredPileup(coverageThreshold).depthOfCoverage(); + else + depth = context.getBasePileup().getBaseAndMappingFilteredPileup(minBaseQuality,minMappingQuality).depthOfCoverage(); // note the linear probability scale return new ActivityProfileState(ref.getLocus(), Math.min(depth / 
coverageThreshold, 1)); From b42ccdce112bdc158adf8194950deb7fbc781254 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Tue, 19 Nov 2013 15:05:41 -0500 Subject: [PATCH 53/77] Tweaked gatkdocs index template --- settings/helpTemplates/common.html | 4 ++-- settings/helpTemplates/generic.index.template.html | 4 ++-- settings/helpTemplates/generic.template.html | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/settings/helpTemplates/common.html b/settings/helpTemplates/common.html index 677fdf861..f4fb74af1 100644 --- a/settings/helpTemplates/common.html +++ b/settings/helpTemplates/common.html @@ -58,7 +58,7 @@ @@ -82,7 +82,7 @@

See also Guide Index | - Technical Documentation Index | + Tool Documentation Index | Support Forum

diff --git a/settings/helpTemplates/generic.index.template.html b/settings/helpTemplates/generic.index.template.html index b3e3d0212..a5650d55e 100644 --- a/settings/helpTemplates/generic.index.template.html +++ b/settings/helpTemplates/generic.index.template.html @@ -53,8 +53,8 @@ -<@makeHeader title="Technical Documentation Index" isIndex=true /> -

Technical Documentation Index +<@makeHeader title="GATK | Tool Documentation Index" isIndex=true /> +

Tool Documentation Index ${version}

diff --git a/settings/helpTemplates/generic.template.html b/settings/helpTemplates/generic.template.html index b78bba48c..eea741669 100644 --- a/settings/helpTemplates/generic.template.html +++ b/settings/helpTemplates/generic.template.html @@ -88,7 +88,7 @@ -<@makeHeader title="${name} documentation" isIndex=false /> +<@makeHeader title="GATK | ${name} documentation" isIndex=false />