> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
- final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
- if (! a.isInformative() )
- continue; // read is non-informative
-
- final GATKSAMRecord read = el.getKey();
- if ( read.getSoftStart() + read.getCigar().getReadLength() <= refLoc ) { // make sure the read actually covers the requested ref loc
- continue;
- }
- final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate( read.getSoftStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true );
- if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED || read.getCigar() == null )
- continue;
- int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, 0, 0 );
- final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read );
- if (readPos > numAlignedBases / 2)
- readPos = numAlignedBases - (readPos + 1);
-
- if (a.getMostLikelyAllele().isReference())
- refQuals.add((double)readPos);
- else if (allAlleles.contains(a.getMostLikelyAllele()))
- altQuals.add((double)readPos);
- }
+ int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, 0, 0 );
+ final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read );
+ if (readPos > numAlignedBases / 2)
+ readPos = numAlignedBases - (readPos + 1);
+ return (double)readPos;
}
- int getFinalReadPosition(GATKSAMRecord read, int initialReadPosition) {
+ protected Double getElementForPileupElement(final PileupElement p) {
+ final int offset = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0);
+ return (double)getFinalReadPosition(p.getRead(), offset);
+ }
+
+ @Override
+ protected boolean isUsableBase(final PileupElement p) {
+ return super.isUsableBase(p) && p.getRead().getCigar() != null;
+ }
+
+ @Override
+ protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) {
+ return super.isUsableRead(read, refLoc) && read.getSoftStart() + read.getCigar().getReadLength() > refLoc;
+ }
+
+ private int getFinalReadPosition(final GATKSAMRecord read, final int initialReadPosition) {
final int numAlignedBases = getNumAlignedBases(read);
int readPos = initialReadPosition;
@@ -141,7 +116,8 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
return readPos;
}
- int getNumClippedBasesAtStart(SAMRecord read) {
+
+ private int getNumClippedBasesAtStart(final SAMRecord read) {
// compute total number of clipped bases (soft or hard clipped)
// check for hard clips (never consider these bases):
final Cigar c = read.getCigar();
@@ -151,8 +127,8 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
if (first.getOperator() == CigarOperator.H) {
numStartClippedBases = first.getLength();
}
- byte[] unclippedReadBases = read.getReadBases();
- byte[] unclippedReadQuals = read.getBaseQualities();
+ final byte[] unclippedReadBases = read.getReadBases();
+ final byte[] unclippedReadQuals = read.getBaseQualities();
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
@@ -167,11 +143,11 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
return numStartClippedBases;
}
- int getNumAlignedBases(SAMRecord read) {
+ private int getNumAlignedBases(final GATKSAMRecord read) {
return read.getReadLength() - getNumClippedBasesAtStart(read) - getNumClippedBasesAtEnd(read);
}
- int getNumClippedBasesAtEnd(SAMRecord read) {
+ private int getNumClippedBasesAtEnd(final GATKSAMRecord read) {
// compute total number of clipped bases (soft or hard clipped)
// check for hard clips (never consider these bases):
final Cigar c = read.getCigar();
@@ -181,8 +157,8 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
if (last.getOperator() == CigarOperator.H) {
numEndClippedBases = last.getLength();
}
- byte[] unclippedReadBases = read.getReadBases();
- byte[] unclippedReadQuals = read.getBaseQualities();
+ final byte[] unclippedReadBases = read.getReadBases();
+ final byte[] unclippedReadQuals = read.getBaseQualities();
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
@@ -193,11 +169,6 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
break;
}
-
return numEndClippedBases;
}
-
- int getOffsetFromClippedReadStart(SAMRecord read, int offset) {
- return offset - getNumClippedBasesAtStart(read);
- }
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java
new file mode 100644
index 000000000..7a7527dd1
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java
@@ -0,0 +1,583 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+package org.broadinstitute.sting.gatk.walkers.bqsr;
+
+import com.google.java.contract.Requires;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Input;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.RodWalker;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.recalibration.RecalUtils;
+import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
+import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+
+/**
+ * Tool to analyze and evaluate base recalibration tables.
+ *
+ * For now it generates a plot report to assess the quality of a recalibration.
+ *
+ * Input
+ *
+ * The tool can take up to three different sets of recalibration tables.
+ * The resulting plots will be overlaid on top of each other to make
+ * comparisons easy.
+ *
+ *
+ *
+ * | Set | Argument | Label | Color | Description |
+ *
+ *
+ * | Original | -before | BEFORE | Maroon1 |
+ * First pass recalibration
+ * tables obtained from applying {@link BaseRecalibration}
+ * on the original alignment. |
+ * | Recalibrated | -after | AFTER | Blue |
+ * Second pass recalibration tables
+ * results from the application of {@link BaseRecalibration}
+ * on the alignment recalibrated using the first pass tables |
+ * | Input | -BQSR | BQSR | Black |
+ * Any recalibration table without a specific role |
+ *
+ *
+ *
+ *
+ * You need to specify one set at least. Multiple sets need to have the same values for the following parameters:
+ *
+ * covariate (order is not important), no_standard_covs, run_without_dbsnp, solid_recal_mode,
+ * solid_nocall_strategy, mismatches_context_size, mismatches_default_quality, deletions_default_quality,
+ * insertions_default_quality, maximum_cycle_value, low_quality_tail, default_platform, force_platform,
+ * quantizing_levels and binary_tag_name
+ * Output
+ *
+ * Currently this tool generates two outputs:
+ *
+ *
+ * - -plots my-report.pdf
+ * - A pdf document that encloses plots to assess the quality of the recalibration.
+ * - -csv my-report.csv
+ * - A csv file that contains a table with all the data required to generate those plots.
+ *
+ *
+ * You need to specify at least one of them.
+ *
+ * Other Arguments
+ *
+ * -ignoreLMT, --ignoreLastModificationTimes
+ *
+ * when set, no warning message will be displayed if the -before recalibration table file is older than the -after one.
+ *
+ * Examples
+ *
+ *
+ * Plot a single recalibration table
+ *
+ * java -jar GenomeAnalysisTK.jar \
+ * -T AnalyzeCovariates \
+ * -R myreference.fasta \
+ * -BQSR myrecal.table \
+ * -plots BQSR.pdf
+ *
+ *
+ * Plot before (first pass) and after (second pass) recalibration table to compare them
+ *
+ *
+ * java -jar GenomeAnalysisTK.jar \
+ * -T AnalyzeCovariates \
+ * -R myreference.fasta \
+ * -before recal2.table \
+ * -after recal3.table \
+ * -plots recalQC.pdf
+ *
+ *
+ * Plot up to three recalibration tables for comparison
+ *
+ *
+ *
+ * # You can ignore the before/after semantics completely if you like (if you do add -ignoreLMT
+ * # to avoid a possible warning), but all tables should have been generated using the same parameters.
+ *
+ * java -jar GenomeAnalysisTK.jar \
+ * -T AnalyzeCovariates \
+ * -R myreference.fasta \
+ * -ignoreLMT \
+ * -BQSR recal1.table \ # you can discard any two
+ * -before recal2.table \
+ * -after recal3.table \
+ * -plots myrecals.pdf
+ *
+ *
+ * Full BQSR quality assessment pipeline
+ *
+ *
+ * # Generate the first pass recalibration table file.
+ * java -jar GenomeAnalysisTK.jar \
+ * -T BaseRecalibrator \
+ * -R myreference.fasta \
+ * -I myinput.bam \
+ * -knownSites bundle/my-trusted-snps.vcf \ # optional but recommendable
+ * -knownSites bundle/my-trusted-indels.vcf \ # optional but recommendable
+ * ... other options
+ * -o firstpass.table
+ *
+ * # Generate the second pass recalibration table file.
+ * java -jar GenomeAnalysisTK.jar \
+ * -T BaseRecalibrator \
+ * -BQSR firstpass.table \
+ * -R myreference.fasta \
+ * -I myinput.bam \
+ * -knownSites bundle/my-trusted-snps.vcf \
+ * -knownSites bundle/my-trusted-indels.vcf \
+ * ... other options \
+ * -o secondpass.table
+ *
+ * # Finally generate the plots report and also keep a copy of the csv (optional).
+ * java -jar GenomeAnalysisTK.jar \
+ * -T AnalyzeCovariates \
+ * -R myreference.fasta \
+ * -before firstpass.table \
+ * -after secondpass.table \
+ * -csv BQSR.csv \ # optional
+ * -plots BQSR.pdf
+ *
+ *
+ * @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
+ * @version 6/16/2013
+ * @since 2.6
+ */
+public final class AnalyzeCovariates extends RodWalker {
+
+
+ // Constants on option short names that are used in some error/warning messages:
+
+ static final String CSV_ARG_SHORT_NAME = "csv";
+ static final String PDF_ARG_SHORT_NAME = "plots";
+ static final String BEFORE_ARG_SHORT_NAME = "before";
+ static final String AFTER_ARG_SHORT_NAME = "after";
+
+ /**
+ * File containing the recalibration tables from the first pass.
+ */
+ @Input(shortName=BEFORE_ARG_SHORT_NAME,fullName="beforeReportFile", doc = "file containing the BQSR first-pass report file",required = false)
+ protected File beforeFile = null;
+
+ /**
+ * File containing the recalibration tables from the second pass.
+ */
+ @Input(shortName=AFTER_ARG_SHORT_NAME, fullName="afterReportFile", doc = "file containing the BQSR second-pass report file",required = false)
+ protected File afterFile = null;
+
+ /**
+ * If true, it won't show a warning if the last-modification time of the before and after input files suggest that they have been reversed.
+ */
+ @Argument(shortName="ignoreLMT", fullName="ignoreLastModificationTimes", doc= "do not emit warning messages related to suspicious last modification time order of inputs", required = false)
+ protected boolean ignoreLastModificationTime = false;
+
+ /**
+ * Output report file name.
+ */
+ @Output(shortName=PDF_ARG_SHORT_NAME, fullName="plotsReportFile" ,doc = "location of the output report", required = false)
+ protected File pdfFile = null;
+
+ /**
+ * Output csv file name.
+ */
+ @Output(shortName=CSV_ARG_SHORT_NAME,fullName="intermediateCsvFile" ,doc = "location of the csv intermediate file", required = false)
+ protected File csvFile = null;
+
+ /**
+ * Convenience reference to the RECAL_BQSR_FILE argument value.
+ *
+ * This field value is resolved by {@link #initialize()}.
+ */
+ protected File bqsrFile = null;
+
+ /**
+ * Checks inputs and argument values.
+ *
+ * Notice that this routine will not validate the content of files. It may have some minor side effects, such as
+ * the output of warning messages back to the user.
+ *
+ * @throws IllegalStateException there is some required argument value that has not been loaded yet.
+ * @throws UserException if there is some error caused by or under the end user's control.
+ */
+ private void checkArgumentsValues() {
+ checkInputReportFile("BQSR",bqsrFile);
+ checkInputReportFile("before",beforeFile);
+ checkInputReportFile("after",afterFile);
+ if (bqsrFile == null && beforeFile == null && afterFile == null) {
+ throw new UserException("you must provide at least one recalibration report file "
+ + "(arguments -BQSR, -" + BEFORE_ARG_SHORT_NAME + " or -" + AFTER_ARG_SHORT_NAME);
+ }
+
+ checkOutputFile(PDF_ARG_SHORT_NAME,pdfFile);
+ checkOutputFile(CSV_ARG_SHORT_NAME, csvFile);
+ checkInputReportFileLMT(beforeFile,afterFile);
+ checkOutputRequested();
+ }
+
+ /**
+ * Checks whether the last-modification-time of the inputs is consistent with their relative roles.
+ *
+ * This routine does not throw an exception but may output a warning message if inconsistencies are spotted.
+ *
+ * @param beforeFile the before report file.
+ * @param afterFile the after report file.
+ */
+ private void checkInputReportFileLMT(final File beforeFile, final File afterFile) {
+
+ if (ignoreLastModificationTime || beforeFile == null || afterFile == null) {
+ return; // nothing to do here
+ } else if (beforeFile.lastModified() > afterFile.lastModified()) {
+ Utils.warnUser("Last modification timestamp for 'Before' and 'After'"
+ + "recalibration reports are in the wrong order. Perhaps, have they been swapped?");
+ }
+ }
+
+ /**
+ * Checks that at least one output was requested.
+ *
+ * @throws UserException if no output was requested.
+ */
+ private void checkOutputRequested() {
+ if (pdfFile == null && csvFile == null) {
+ throw new UserException("you need to request at least one output:"
+ + " the intermediate csv file (-" + CSV_ARG_SHORT_NAME + " FILE)"
+ + " or the final plot file (-" + PDF_ARG_SHORT_NAME + " FILE).");
+ }
+ }
+
+ /**
+ * Checks the value provided to input file arguments.
+ *
+ * @throws UserException if there is any problem caused by or under the end user's control
+ *
+ * @param name command line argument short name.
+ * @param value the argument value.
+ */
+ private void checkInputReportFile(final String name,final File value) {
+ if (value == null) {
+ return;
+ } else if (!value.exists()) {
+ throw new UserException.BadArgumentValue(name, "input report '" +
+ value + "' does not exist or is unreachable");
+ } else if (!value.isFile()) {
+ throw new UserException.BadArgumentValue(name, "input report '" +
+ value + "' is not a regular file");
+ } else if (!value.canRead()) {
+ throw new UserException.BadArgumentValue(name, "input report '" +
+ value + "' cannot be read");
+ }
+ }
+
+ /**
+ * Checks the value provided for output arguments.
+ *
+ * @throws UserException if there is any problem caused by or under the end user's control
+ *
+ * @param name command line argument short name.
+ * @param value the argument value.
+ */
+ private void checkOutputFile(final String name, final File value) {
+ if (value == null) {
+ return;
+ }
+ if (value.exists() && !value.isFile()) {
+ throw new UserException.BadArgumentValue(name, "the output file location '"
+ + value + "' exists as not a file");
+ }
+ final File parent = value.getParentFile();
+ if (parent == null) {
+ return;
+ }
+ if (!parent.exists()) {
+ throw new UserException.BadArgumentValue(name, "the output file parent directory '"
+ + parent + "' does not exists or is unreachable");
+ } else if (!parent.isDirectory()) {
+ throw new UserException.BadArgumentValue(name, "the output file parent directory '"
+ + parent + "' is not a directory");
+ } else if (!parent.canWrite()) {
+ throw new UserException.BadArgumentValue(name, "the output file parent directory '"
+ + parent + "' cannot be written");
+ }
+
+ }
+
+ /**
+ * Generates the plots using the external R script.
+ *
+ *
+ * If plotsFile is null, it does not perform any plotting.
+ *
+ * @param csvFile the intermediary csv file.
+ * @param plotsFile the output plot location.
+ */
+ private void generatePlots(final File csvFile, final Map reportFiles, final File plotsFile) {
+
+ if (plotsFile == null) {
+ return;
+ }
+ logger.info("Generating plots file '" + plotsFile + "'");
+ final File exampleReportFile = reportFiles.values().iterator().next();
+ RecalUtils.generatePlots(csvFile,exampleReportFile,plotsFile);
+ }
+
+ @Override
+ public void initialize() {
+ super.initialize();
+ bqsrFile = getToolkit().getArguments().BQSR_RECAL_FILE;
+ checkArgumentsValues();
+ final Map reportFiles = buildReportFileMap();
+ final Map reports = buildReportMap(reportFiles);
+ checkReportConsistency(reports);
+ final File csvFile = resolveCsvFile();
+ generateCsvFile(csvFile,reports);
+ final File plotFile = resolvePlotFile();
+ generatePlots(csvFile, reportFiles, plotFile);
+ }
+
+ /**
+ * Returns the plot output file
+ * @return might be null if the user has not indicated an output file.
+ */
+ private File resolvePlotFile() {
+ return pdfFile;
+ }
+
+ /**
+ * Generates the intermediary Csv file.
+ *
+ * @param csvFile where to write the file.
+ * @param reports the reports to be included.
+ */
+ private void generateCsvFile(final File csvFile, final Map reports) {
+ try {
+ logger.info("Generating csv file '" + csvFile + "'");
+ RecalUtils.generateCsv(csvFile, reports);
+ } catch (FileNotFoundException e) {
+ throw new UserException(
+ String.format("There is a problem creating the intermediary Csv file '%s': %s",
+ csvFile,e.getMessage()),e);
+ }
+ }
+
+ /**
+ * Checks whether multiple input recalibration report files argument values are consistent (equal).
+ *
+ * @param reports map with report to verify.
+ *
+ * @throws UserException if there is any inconsistency.
+ */
+ private void checkReportConsistency(final Map reports) {
+ final Map.Entry[] reportEntries =
+ reports.entrySet().toArray((Map.Entry[]) new Map.Entry[reports.size()]);
+
+ final Map.Entry exampleEntry = reportEntries[0];
+
+ for (int i = 1; i < reportEntries.length; i++) {
+ final Map diffs = exampleEntry.getValue().getRAC().compareReportArguments(
+ reportEntries[i].getValue().getRAC(),exampleEntry.getKey(),reportEntries[i].getKey());
+ if (diffs.size() != 0) {
+ throw new UserException.IncompatibleRecalibrationTableParameters("There are differences in relevant arguments of"
+ + " two or more input recalibration reports. Please make sure"
+ + " they have been created using the same recalibration parameters."
+ + " " + Utils.join("// ", reportDifferencesStringArray(diffs)));
+ }
+ }
+ }
+
+
+ /**
+ * Creates a map with all input recalibration files indexed by their "role".
+ *
+ * The key is the role and the value the corresponding report file.
+ *
+ * Roles: "Before" (recalibration), "After" (recalibration), "BQSR" (the tool standard argument recalibration file)
+ *
+ * @return never null
+ */
+ private Map buildReportFileMap() {
+ final Map reports = new LinkedHashMap<>(3);
+ if (bqsrFile != null) {
+ reports.put("BQSR",bqsrFile);
+ }
+ if (beforeFile != null) {
+ reports.put("Before",beforeFile);
+ }
+ if (afterFile != null) {
+ reports.put("After",afterFile);
+ }
+ return reports;
+ }
+
+ /**
+ * Transforms a recalibration file map into a report object map.
+ *
+ * @param reportFileMap the file map to transform.
+ * @return never null, a new map with the same size as
+ * reportFileMap and the same key set.
+ */
+ @Requires("reportFileMap != null")
+ private Map buildReportMap(final Map reportFileMap) {
+ final Map reports = new LinkedHashMap<>(reportFileMap.size());
+ for (final Map.Entry e : reportFileMap.entrySet()) {
+ reports.put(e.getKey(),new RecalibrationReport(e.getValue()));
+ }
+ return reports;
+ }
+
+ /**
+ * Generates a flatter String array representation of recalibration argument differences.
+ * @param diffs the differences to represent.
+ *
+ * @return never null, an array of the same length as the size of the input diffs.
+ */
+ @Requires("diffs != null")
+ private String[] reportDifferencesStringArray(final Map diffs) {
+ final String[] result = new String[diffs.size()];
+ int i = 0;
+ for (final Map.Entry e : diffs.entrySet()) {
+ result[i++] = capitalize(e.getKey()) + ": " + e.getValue();
+ }
+ return result;
+ }
+
+ /**
+ * Returns the input string capitalizing the first letter.
+ *
+ * @param str the string to capitalize
+ * @return never null.
+ */
+ @Requires("str != null")
+ private String capitalize(final String str) {
+ if (str.isEmpty()) {
+ return str;
+ } else {
+ return Character.toUpperCase(str.charAt(0)) + str.substring(1);
+ }
+ }
+
+ /**
+ * Returns the csv file to use.
+ *
+ * This is the one specified by the user, if any, or a temporary file
+ * that will be deleted as soon as the VM exits by default.
+ *
+ * @return never null.
+ */
+ private File resolveCsvFile() {
+ if (csvFile != null) {
+ return csvFile;
+ } else {
+ try {
+ final File result = File.createTempFile("AnalyzeCovariates", ".csv");
+ result.deleteOnExit();
+ return result;
+ } catch (IOException e) {
+ throw new UserException("Could not create temporary Csv file",e);
+ }
+ }
+ }
+
+ /**
+ * Always returns true, forcing the immediate termination of the traversal.
+ * @return
+ */
+ @Override
+ public boolean isDone() {
+ return true;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public None reduceInit() {
+ return new None();
+ }
+
+ /**
+ * Is not supposed to ever be called, thus it always results in an exception.
+ *
+ * @throws IllegalStateException always.
+ */
+ @Override
+ public None reduce(None value, None sum) {
+ throw new IllegalStateException("AnalyzeCovariates reduce method is not supposed to be invoked ever");
+ }
+
+
+ /**
+ * Is not supposed to ever be called, thus it always results in an exception.
+ *
+ * @throws IllegalStateException always.
+ */
+ @Override
+ public None map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ throw new IllegalStateException("AnalyzeCovariates map method is not supposed to be invoked ever");
+ }
+
+ /**
+ * Dummy map and reduce types for the {@link AnalyzeCovariates} tool that in fact does not do any traversal.
+ */
+ protected static class None {
+ private None() {
+ }
+ }
+}
+
+
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java
index ad97dc008..d6f0e16e8 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java
@@ -92,18 +92,6 @@ public class BQSRGatherer extends Gatherer {
generalReport.calculateQuantizedQualities();
- RecalibrationArgumentCollection RAC = generalReport.getRAC();
- if ( RAC.RECAL_PDF_FILE != null ) {
- RAC.RECAL_TABLE_FILE = output;
- if ( RAC.existingRecalibrationReport != null ) {
- final RecalibrationReport originalReport = new RecalibrationReport(RAC.existingRecalibrationReport);
- RecalUtils.generateRecalibrationPlot(RAC, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates());
- }
- else {
- RecalUtils.generateRecalibrationPlot(RAC, generalReport.getRecalibrationTables(), generalReport.getCovariates());
- }
- }
-
generalReport.output(outputFile);
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
index dde49b7db..3882b70fa 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
@@ -61,6 +61,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.collections.Pair;
@@ -124,7 +125,7 @@ import java.util.List;
* -R resources/Homo_sapiens_assembly18.fasta \
* -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
* -knownSites another/optional/setOfSitesToMask.vcf \
- * -o recal_data.grp
+ * -o recal_data.table
*
*/
@@ -366,9 +367,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche
}
protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) {
- final byte[] readBases = read.getReadBases();
- final int[] indel = new int[readBases.length];
- Arrays.fill(indel, 0);
+ final int[] indel = new int[read.getReadBases().length];
int readPos = 0;
for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
final int elementLength = ce.getLength();
@@ -383,21 +382,19 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche
}
case D:
{
- final int index = ( read.getReadNegativeStrandFlag() ? readPos : ( readPos > 0 ? readPos - 1 : readPos ) );
- indel[index] = ( mode.equals(EventType.BASE_DELETION) ? 1 : 0 );
+ final int index = ( read.getReadNegativeStrandFlag() ? readPos : readPos - 1 );
+ updateIndel(indel, index, mode, EventType.BASE_DELETION);
break;
}
case I:
{
final boolean forwardStrandRead = !read.getReadNegativeStrandFlag();
if( forwardStrandRead ) {
- indel[(readPos > 0 ? readPos - 1 : readPos)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
- }
- for (int iii = 0; iii < elementLength; iii++) {
- readPos++;
+ updateIndel(indel, readPos - 1, mode, EventType.BASE_INSERTION);
}
+ readPos += elementLength;
if( !forwardStrandRead ) {
- indel[(readPos < indel.length ? readPos : readPos - 1)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
+ updateIndel(indel, readPos, mode, EventType.BASE_INSERTION);
}
break;
}
@@ -412,6 +409,12 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche
return indel;
}
+ private static void updateIndel(final int[] indel, final int index, final EventType mode, final EventType requiredMode) {
+ if ( mode == requiredMode && index >= 0 && index < indel.length )
+ // protect ourselves from events at the start or end of the read (1D3M or 3M1D)
+ indel[index] = 1;
+ }
+
protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) {
if(errorArray.length != baqArray.length ) {
throw new ReviewedStingException("Array length mismatch detected. Malformed read?");
@@ -514,28 +517,13 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche
generateReport();
logger.info("...done!");
- if ( RAC.RECAL_PDF_FILE != null ) {
- logger.info("Generating recalibration plots...");
- generatePlots();
- }
-
- logger.info("Processed: " + result + " reads");
+ logger.info("BaseRecalibrator was able to recalibrate " + result + " reads");
}
private RecalibrationTables getRecalibrationTable() {
return recalibrationEngine.getFinalRecalibrationTables();
}
- private void generatePlots() {
- File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE;
- if (recalFile != null) {
- RecalibrationReport report = new RecalibrationReport(recalFile);
- RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), getRecalibrationTable(), requestedCovariates);
- }
- else
- RecalUtils.generateRecalibrationPlot(RAC, getRecalibrationTable(), requestedCovariates);
- }
-
/**
* go through the quality score table and use the # observations and the empirical quality score
* to build a quality score histogram for quantization. Then use the QuantizeQual algorithm to
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
index 0a4899f1c..b9f16132c 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
@@ -46,15 +46,17 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
+import com.google.java.contract.Requires;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
import java.io.File;
import java.io.PrintStream;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
/**
* Created by IntelliJ IDEA.
@@ -65,7 +67,7 @@ import java.util.List;
* This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
*/
-public class RecalibrationArgumentCollection {
+public class RecalibrationArgumentCollection implements Cloneable {
/**
* This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
@@ -87,21 +89,6 @@ public class RecalibrationArgumentCollection {
public File RECAL_TABLE_FILE = null;
public PrintStream RECAL_TABLE;
- /**
- * If not provided, then no plots will be generated (useful for queue scatter/gathering).
- * However, we *highly* recommend that users generate these plots whenever possible for QC checking.
- */
- @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false, defaultToStdout = false)
- public File RECAL_PDF_FILE = null;
-
- /**
- * If not provided, then a temporary file is created and then deleted upon completion.
- * For advanced users only.
- */
- @Advanced
- @Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false)
- public File RECAL_CSV_FILE = null;
-
/**
* Note that the --list argument requires a fully resolved and correct command-line to work.
*/
@@ -219,6 +206,10 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
+ @Hidden
+ @Argument(fullName = "force_readgroup", shortName = "fRG", required = false, doc = "If provided, the read group of EVERY read will be forced to be the provided String.")
+ public String FORCE_READGROUP = null;
+
@Hidden
@Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", defaultToStdout = false)
public PrintStream RECAL_TABLE_UPDATE_LOG = null;
@@ -278,11 +269,147 @@ public class RecalibrationArgumentCollection {
argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS);
argumentsTable.addRowID("recalibration_report", true);
argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath());
- argumentsTable.addRowID("plot_pdf_file", true);
- argumentsTable.set("plot_pdf_file", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RECAL_PDF_FILE == null ? "null" : RECAL_PDF_FILE.getAbsolutePath());
argumentsTable.addRowID("binary_tag_name", true);
argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME);
return argumentsTable;
}
+ /**
+ * Returns a map with the arguments that differ between this and
+ * another {@link RecalibrationArgumentCollection} instance.
+ *
+ * The key is the name of that argument in the report file. The value is a message
+ * that explains the difference to the end user.
+ *
+ * Thus, an empty map indicates that there are no differences between both argument collections that
+ * is relevant to report comparison.
+ *
+ * This method should not throw any exception.
+ *
+ * @param other the argument-collection to compare against.
+ * @param thisRole the name used to refer to this RAC report that makes sense to the end user.
+ * @param otherRole the name used to refer to the other RAC report that makes sense to the end user.
+ *
+ * @return never null, but a zero-size collection if there are no differences.
+ */
+ @Requires("other != null && thisRole != null && otherRole != null && !thisRole.equalsIgnoreCase(otherRole)")
+ Map compareReportArguments(final RecalibrationArgumentCollection other,final String thisRole, final String otherRole) {
+ final Map result = new LinkedHashMap<>(15);
+ compareRequestedCovariates(result, other, thisRole, otherRole);
+ compareSimpleReportArgument(result,"no_standard_covs", DO_NOT_USE_STANDARD_COVARIATES, other.DO_NOT_USE_STANDARD_COVARIATES, thisRole, otherRole);
+ compareSimpleReportArgument(result,"run_without_dbsnp",RUN_WITHOUT_DBSNP,other.RUN_WITHOUT_DBSNP,thisRole,otherRole);
+ compareSimpleReportArgument(result,"solid_recal_mode", SOLID_RECAL_MODE, other.SOLID_RECAL_MODE,thisRole,otherRole);
+ compareSimpleReportArgument(result,"solid_nocall_strategy", SOLID_NOCALL_STRATEGY, other.SOLID_NOCALL_STRATEGY,thisRole,otherRole);
+ compareSimpleReportArgument(result,"mismatches_context_size", MISMATCHES_CONTEXT_SIZE,other.MISMATCHES_CONTEXT_SIZE,thisRole,otherRole);
+ compareSimpleReportArgument(result,"mismatches_default_quality", MISMATCHES_DEFAULT_QUALITY, other.MISMATCHES_DEFAULT_QUALITY,thisRole,otherRole);
+ compareSimpleReportArgument(result,"deletions_default_quality", DELETIONS_DEFAULT_QUALITY, other.DELETIONS_DEFAULT_QUALITY,thisRole,otherRole);
+ compareSimpleReportArgument(result,"insertions_default_quality", INSERTIONS_DEFAULT_QUALITY, other.INSERTIONS_DEFAULT_QUALITY,thisRole,otherRole);
+ compareSimpleReportArgument(result,"maximum_cycle_value", MAXIMUM_CYCLE_VALUE, other.MAXIMUM_CYCLE_VALUE,thisRole,otherRole);
+ compareSimpleReportArgument(result,"low_quality_tail", LOW_QUAL_TAIL, other.LOW_QUAL_TAIL,thisRole,otherRole);
+ compareSimpleReportArgument(result,"default_platform", DEFAULT_PLATFORM, other.DEFAULT_PLATFORM,thisRole,otherRole);
+ compareSimpleReportArgument(result,"force_platform", FORCE_PLATFORM, other.FORCE_PLATFORM,thisRole,otherRole);
+ compareSimpleReportArgument(result,"quantizing_levels", QUANTIZING_LEVELS, other.QUANTIZING_LEVELS,thisRole,otherRole);
+ compareSimpleReportArgument(result,"binary_tag_name", BINARY_TAG_NAME, other.BINARY_TAG_NAME,thisRole,otherRole);
+ return result;
+ }
+
+
+ /**
+ * Compares the covariate report lists.
+ *
+ * @param diffs map where to annotate the difference.
+ * @param other the argument collection to compare against.
+ * @param thisRole the name for this argument collection that makes sense to the user.
+ * @param otherRole the name for the other argument collection that makes sense to the end user.
+ *
+ * @return true if a difference was found.
+ */
+ @Requires("diffs != null && other != null && thisRole != null && otherRole != null")
+ private boolean compareRequestedCovariates(final Map diffs,
+ final RecalibrationArgumentCollection other, final String thisRole, final String otherRole) {
+
+ final Set beforeNames = new HashSet<>(this.COVARIATES.length);
+ final Set afterNames = new HashSet<>(other.COVARIATES.length);
+ Utils.addAll(beforeNames, this.COVARIATES);
+ Utils.addAll(afterNames,other.COVARIATES);
+ final Set intersect = new HashSet<>(Math.min(beforeNames.size(),afterNames.size()));
+ intersect.addAll(beforeNames);
+ intersect.retainAll(afterNames);
+
+ String diffMessage = null;
+ if (intersect.size() == 0) { // In practice this is not possible due to required covariates but...
+ diffMessage = String.format("There are no common covariates between '%s' and '%s'"
+ + " recalibrator reports. Covariates in '%s': {%s}. Covariates in '%s': {%s}.",thisRole,otherRole,
+ thisRole,Utils.join(", ",this.COVARIATES),
+ otherRole,Utils.join(",",other.COVARIATES));
+ } else if (intersect.size() != beforeNames.size() || intersect.size() != afterNames.size()) {
+ beforeNames.removeAll(intersect);
+ afterNames.removeAll(intersect);
+ diffMessage = String.format("There are differences in the set of covariates requested in the"
+ + " '%s' and '%s' recalibrator reports. "
+ + " Exclusive to '%s': {%s}. Exclusive to '%s': {%s}.",thisRole,otherRole,
+ thisRole,Utils.join(", ",beforeNames),
+ otherRole,Utils.join(", ",afterNames));
+ }
+ if (diffMessage != null) {
+ diffs.put("covariate",diffMessage);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Annotates a map with any difference encountered in a simple value report argument that differs between this and
+ * another {@link RecalibrationArgumentCollection} instance.
+ *
+ * The key of the new entry would be the name of that argument in the report file. The value is a message
+ * that explains the difference to the end user.
+ *
+ *
+ *
+ * This method should not throw any exception.
+ *
+ * @param diffs where to annotate the differences.
+ * @param name the name of the report argument to compare.
+ * @param thisValue this argument collection value for that argument.
+ * @param otherValue the other collection value for that argument.
+ * @param thisRole the name used to refer to this RAC report that makes sense to the end user.
+ * @param otherRole the name used to refer to the other RAC report that makes sense to the end user.
+ *
+ * @param <T> the argument Object value type.
+ *
+ * @return true if a difference has been spotted, thus {@code diffs} has been modified.
+ */
+ private boolean compareSimpleReportArgument(final Map diffs,
+ final String name, final T thisValue, final T otherValue, final String thisRole, final String otherRole) {
+ if (thisValue == null && otherValue == null) {
+ return false;
+ } else if (thisValue != null && thisValue.equals(otherValue)) {
+ return false;
+ } else {
+ diffs.put(name,
+ String.format("differences between '%s' {%s} and '%s' {%s}.",
+ thisRole,thisValue == null ? "" : thisValue,
+ otherRole,otherValue == null ? "" : otherValue));
+ return true;
+ }
+
+ }
+
+ /**
+ * Create a shallow copy of this argument collection.
+ *
+ * @return never null.
+ */
+ @Override
+ public RecalibrationArgumentCollection clone() {
+ try {
+ return (RecalibrationArgumentCollection) super.clone();
+ } catch (CloneNotSupportedException e) {
+ throw new StingException("Unreachable code clone not supported thrown when the class "
+ + this.getClass().getName() + " is cloneable ",e);
+ }
+ }
+
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java
index 38b9e957b..ba2c2ae56 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java
@@ -207,7 +207,7 @@ public class HeaderElement {
public void removeInsertionToTheRight() {
this.insertionsToTheRight--;
if (insertionsToTheRight < 0)
- throw new ReviewedStingException("Removed too many insertions, header is now negative!");
+ throw new ReviewedStingException("Removed too many insertions, header is now negative at position " + location);
}
public boolean hasInsertionToTheRight() {
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
index 71910e566..e636f8f17 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
@@ -64,6 +64,7 @@ import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@@ -236,6 +237,15 @@ public class ReduceReads extends ReadWalker, Redu
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
public int downsampleCoverage = 250;
+ /**
+ * Generally, this tool is not meant to be run for more than 1 sample at a time. The one valid exception
+ * brought to our attention by colleagues is the specific case of tumor/normal pairs in cancer analysis.
+ * To prevent users from unintentionally running the tool in a less than ideal manner, we require them
+ * to explicitly enable multi-sample analysis with this argument.
+ */
+ @Argument(fullName = "cancer_mode", shortName = "cancer_mode", doc = "enable multi-samples reduction for cancer analysis", required = false)
+ public boolean ALLOW_MULTIPLE_SAMPLES = false;
+
@Hidden
@Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false)
public boolean nwayout = false;
@@ -263,8 +273,9 @@ public class ReduceReads extends ReadWalker, Redu
int nCompressedReads = 0;
- Object2LongOpenHashMap readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
+ private static int READ_NAME_HASH_DEFAULT_SIZE = 1000;
Long nextReadNumber = 1L; // The next number to use for the compressed read name.
+ Object2LongOpenHashMap readNameHash; // This hash maps the name of the original read to the new compressed name (a number).
ObjectSortedSet intervalList;
@@ -294,13 +305,16 @@ public class ReduceReads extends ReadWalker, Redu
if ( minAltProportionToTriggerVariant < 0.0 || minAltProportionToTriggerVariant > 1.0 )
throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
+ if ( SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()).size() > 1 && !ALLOW_MULTIPLE_SAMPLES )
+ throw new UserException.BadInput("Reduce Reads is not meant to be run for more than 1 sample at a time except for the specific case of tumor/normal pairs in cancer analysis");
+
if ( known.isEmpty() )
knownSnpPositions = null;
else
knownSnpPositions = new ObjectAVLTreeSet();
GenomeAnalysisEngine toolkit = getToolkit();
- readNameHash = new Object2LongOpenHashMap(100000); // prepare the read name hash to keep track of what reads have had their read names compressed
+ this.resetReadNameHash(); // prepare the read name hash to keep track of what reads have had their read names compressed
intervalList = new ObjectAVLTreeSet(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode
if (toolkit.getIntervals() != null)
@@ -322,6 +336,16 @@ public class ReduceReads extends ReadWalker, Redu
}
}
+ /** Initializer for {@link #readNameHash}. */
+ private void resetReadNameHash() {
+ // If the hash grows large, subsequent clear operations can be very expensive, so trim the hash down if it grows beyond its default.
+ if (readNameHash == null || readNameHash.size() > READ_NAME_HASH_DEFAULT_SIZE) {
+ readNameHash = new Object2LongOpenHashMap(READ_NAME_HASH_DEFAULT_SIZE);
+ } else {
+ readNameHash.clear();
+ }
+ }
+
/**
* Takes in a read and prepares it for the SlidingWindow machinery by performing the
* following optional clipping operations:
@@ -458,7 +482,7 @@ public class ReduceReads extends ReadWalker, Redu
// stash.compress(), the readNameHash can be cleared after the for() loop above.
// The advantage of clearing the hash is that otherwise it holds all reads that have been encountered,
// which can use a lot of memory and cause RR to slow to a crawl and/or run out of memory.
- readNameHash.clear();
+ this.resetReadNameHash();
}
} else
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
index d3ca037be..0425af3df 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
@@ -877,6 +877,10 @@ public class SlidingWindow {
final int start = region.getStart() - windowHeaderStart;
int stop = region.getStop() - windowHeaderStart;
+ // make sure the bitset is complete given the region (it might not be in multi-sample mode)
+ if ( region.getStop() > markedSites.getStartLocation() + markedSites.getVariantSiteBitSet().length )
+ markSites(region.getStop());
+
CloseVariantRegionResult closeVariantRegionResult = closeVariantRegion(start, stop, knownSnpPositions);
allReads.addAll(closeVariantRegionResult.reads);
@@ -1195,7 +1199,7 @@ public class SlidingWindow {
}
// Special case for leading insertions before the beginning of the sliding read
- if ( ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == headerStart || headerStart < 0) ) {
+ if ( (readStart == headerStart || headerStart < 0) && ReadUtils.readStartsWithInsertion(read.getCigar(), false) != null ) {
// create a new first element to the window header with no bases added
header.addFirst(new HeaderElement(readStart - 1));
// this allows the first element (I) to look at locationIndex - 1 when we update the header and do the right thing
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java
index dca83af44..ceccdcb2e 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java
@@ -63,6 +63,10 @@ abstract class AbstractStratification {
private Map statusTally = null;
protected ThresHolder thresholds;
+ public AbstractStratification(ThresHolder thresholds) {
+ this.thresholds = thresholds;
+ }
+
/**
* Calculates the average "good" coverage of this sample. Good means "passes the base and
* mapping quality requirements.
@@ -116,11 +120,11 @@ abstract class AbstractStratification {
*
* @return the callable status(es) for the whole object
*/
- public abstract Iterable callableStatuses();
+ public abstract List callableStatuses();
/**
- * Tally up all the callable status of all the loci in this sample.
+ * Tally up all the callable status of all elements of the stratification.
*
* @return a map of callable status and counts
*/
@@ -136,10 +140,10 @@ abstract class AbstractStratification {
return statusTally;
}
- public static List queryStatus(List statList, AbstractStratification stratification) {
+ public List queryStatus(List statList) {
List output = new LinkedList();
for (Metric stat : statList) {
- final CallableStatus status = stat.status(stratification);
+ final CallableStatus status = stat.status(this);
if (status != null) {
output.add(status);
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java
index 32f87b973..bde324e3c 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java
@@ -52,6 +52,7 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
@@ -65,6 +66,7 @@ import org.broadinstitute.variant.variantcontext.*;
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.variant.vcf.*;
+import java.io.PrintStream;
import java.util.*;
/**
@@ -109,9 +111,13 @@ import java.util.*;
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@By(value = DataSource.READS)
@PartitionBy(PartitionType.INTERVAL)
+@Downsample(by = DownsampleType.NONE)
public class DiagnoseTargets extends LocusWalker {
private static final String AVG_INTERVAL_DP_KEY = "IDP";
+ private static final String LOW_COVERAGE_LOCI = "LL";
+ private static final String ZERO_COVERAGE_LOCI = "ZL";
+
@Output(doc = "File to which interval statistics should be written")
private VariantContextWriter vcfWriter = null;
@@ -119,13 +125,12 @@ public class DiagnoseTargets extends LocusWalker {
@ArgumentCollection
private ThresHolder thresholds = new ThresHolder();
- private Map intervalMap = null; // maps each interval => statistics
+ private Map intervalMap = null; // maps each interval => statistics
private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
private Set samples = null; // all the samples being processed
private static final Allele SYMBOLIC_ALLELE = Allele.create("", false); // avoid creating the symbolic allele multiple times
private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
-
- private static final int INITIAL_HASH_SIZE = 500000;
+ private static final int INITIAL_HASH_SIZE = 50; // enough room for potential overlapping intervals plus recently finished intervals
@Override
public void initialize() {
@@ -134,7 +139,7 @@ public class DiagnoseTargets extends LocusWalker {
if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty())
throw new UserException("This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead.");
- intervalMap = new HashMap(INITIAL_HASH_SIZE);
+ intervalMap = new LinkedHashMap(INITIAL_HASH_SIZE);
intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator());
// get all of the unique sample names for the VCF Header
@@ -146,13 +151,13 @@ public class DiagnoseTargets extends LocusWalker {
}
@Override
- public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ public Long map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
GenomeLoc refLocus = ref.getLocus();
// process and remove any intervals in the map that are don't overlap the current locus anymore
// and add all new intervals that may overlap this reference locus
- outputFinishedIntervals(refLocus, ref.getBase());
addNewOverlappingIntervals(refLocus);
+ outputFinishedIntervals(refLocus, ref.getBase());
// at this point, all intervals in intervalMap overlap with this locus, so update all of them
for (IntervalStratification intervalStratification : intervalMap.values())
@@ -184,7 +189,7 @@ public class DiagnoseTargets extends LocusWalker {
* @param result number of loci processed by the walker
*/
@Override
- public void onTraversalDone(Long result) {
+ public void onTraversalDone(final Long result) {
for (GenomeLoc interval : intervalMap.keySet())
outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);
@@ -194,6 +199,10 @@ public class DiagnoseTargets extends LocusWalker {
intervalListIterator.next();
interval = intervalListIterator.peek();
}
+
+ if (thresholds.missingTargets != null) {
+ thresholds.missingTargets.close();
+ }
}
/**
@@ -203,24 +212,21 @@ public class DiagnoseTargets extends LocusWalker {
* @param refBase the reference allele
*/
private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
- GenomeLoc interval = intervalListIterator.peek();
-
- // output empty statistics for uncovered intervals
- while (interval != null && interval.isBefore(refLocus)) {
- final IntervalStratification stats = intervalMap.get(interval);
- outputStatsToVCF(stats != null ? stats : createIntervalStatistic(interval), UNCOVERED_ALLELE);
- if (stats != null) intervalMap.remove(interval);
- intervalListIterator.next();
- interval = intervalListIterator.peek();
- }
-
- // remove any potential leftover interval in intervalMap (this will only happen when we have overlapping intervals)
+ // output any intervals that were finished
+ final List toRemove = new LinkedList();
for (GenomeLoc key : intervalMap.keySet()) {
if (key.isBefore(refLocus)) {
- outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true));
- intervalMap.remove(key);
+ final IntervalStratification intervalStats = intervalMap.get(key);
+ outputStatsToVCF(intervalStats, Allele.create(refBase, true));
+ if (hasMissingLoci(intervalStats)) {
+ outputMissingInterval(intervalStats);
+ }
+ toRemove.add(key);
}
}
+ for (GenomeLoc key : toRemove) {
+ intervalMap.remove(key);
+ }
}
/**
@@ -228,7 +234,7 @@ public class DiagnoseTargets extends LocusWalker {
*
* @param refLocus the current reference locus
*/
- private void addNewOverlappingIntervals(GenomeLoc refLocus) {
+ private void addNewOverlappingIntervals(final GenomeLoc refLocus) {
GenomeLoc interval = intervalListIterator.peek();
while (interval != null && !interval.isPast(refLocus)) {
intervalMap.put(interval, createIntervalStatistic(interval));
@@ -243,14 +249,24 @@ public class DiagnoseTargets extends LocusWalker {
* @param stats The statistics of the interval
* @param refAllele the reference allele
*/
- private void outputStatsToVCF(IntervalStratification stats, Allele refAllele) {
+ private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) {
GenomeLoc interval = stats.getInterval();
+ final List alleles = new ArrayList();
+ final Map attributes = new HashMap();
+ final ArrayList genotypes = new ArrayList();
- List alleles = new ArrayList();
- Map attributes = new HashMap();
- ArrayList genotypes = new ArrayList();
+ for (String sample : samples) {
+ final GenotypeBuilder gb = new GenotypeBuilder(sample);
+ SampleStratification sampleStat = stats.getSampleStatistics(sample);
+ gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size()));
+ gb.attribute(LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci());
+ gb.attribute(ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci());
+ gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));
+
+ genotypes.add(gb.make());
+ }
alleles.add(refAllele);
alleles.add(SYMBOLIC_ALLELE);
VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles);
@@ -262,21 +278,56 @@ public class DiagnoseTargets extends LocusWalker {
attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size()));
vcb = vcb.attributes(attributes);
- for (String sample : samples) {
- final GenotypeBuilder gb = new GenotypeBuilder(sample);
-
- SampleStratification sampleStat = stats.getSampleStatistics(sample);
- gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size()));
-
- gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));
-
- genotypes.add(gb.make());
- }
vcb = vcb.genotypes(genotypes);
vcfWriter.add(vcb.make());
}
+ private boolean hasMissingStatuses(AbstractStratification stats) {
+ return !stats.callableStatuses().isEmpty();
+ }
+
+ private boolean hasMissingLoci(final IntervalStratification stats) {
+ return thresholds.missingTargets != null && hasMissingStatuses(stats);
+ }
+
+ private void outputMissingInterval(final IntervalStratification stats) {
+ final GenomeLoc interval = stats.getInterval();
+ final boolean missing[] = new boolean[interval.size()];
+ Arrays.fill(missing, true);
+ for (AbstractStratification sample : stats.getElements()) {
+ if (hasMissingStatuses(sample)) {
+ int pos = 0;
+ for (AbstractStratification locus : sample.getElements()) {
+ if (locus.callableStatuses().isEmpty()) {
+ missing[pos] = false;
+ }
+ pos++;
+ }
+ }
+ }
+ int start = -1;
+ boolean insideMissing = false;
+ for (int i = 0; i < missing.length; i++) {
+ if (missing[i] && !insideMissing) {
+ start = interval.getStart() + i;
+ insideMissing = true;
+ } else if (!missing[i] && insideMissing) {
+ final int stop = interval.getStart() + i - 1;
+ outputMissingInterval(interval.getContig(), start, stop);
+ insideMissing = false;
+ }
+ }
+ if (insideMissing) {
+ outputMissingInterval(interval.getContig(), start, interval.getStop());
+ }
+ }
+
+ private void outputMissingInterval(final String contig, final int start, final int stop) {
+ final PrintStream out = thresholds.missingTargets;
+ out.println(String.format("%s:%d-%d", contig, start, stop));
+ }
+
/**
* Function that process a set of statuses into strings
*
@@ -345,6 +396,8 @@ public class DiagnoseTargets extends LocusWalker {
// FORMAT fields for each genotype
headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average sample depth across the interval. Sum of the sample specific depth in all loci divided by interval size."));
+ headerLines.add(new VCFFormatHeaderLine(LOW_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with low coverage (below the minimum coverage) but not zero."));
+ headerLines.add(new VCFFormatHeaderLine(ZERO_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with zero coverage."));
// FILTER fields
for (CallableStatus stat : CallableStatus.values())
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java
index 6c20403d1..3b5a23d51 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java
@@ -56,11 +56,11 @@ import java.util.*;
final class IntervalStratification extends AbstractStratification {
private final Map samples;
private final GenomeLoc interval;
- private final ThresHolder thresholds;
+ private List callableStatuses;
public IntervalStratification(Set samples, GenomeLoc interval, ThresHolder thresholds) {
+ super(thresholds);
this.interval = interval;
- this.thresholds = thresholds;
this.samples = new HashMap(samples.size());
for (String sample : samples)
this.samples.put(sample, new SampleStratification(interval, thresholds));
@@ -114,7 +114,13 @@ final class IntervalStratification extends AbstractStratification {
* {@inheritDoc}
*/
@Override
- public Iterable callableStatuses() {
+ public List callableStatuses() {
+ if (callableStatuses == null)
+ callableStatuses = calculateStatus();
+ return callableStatuses;
+ }
+
+ private List calculateStatus() {
final List output = new LinkedList();
// check if any of the votes pass the threshold
@@ -125,7 +131,7 @@ final class IntervalStratification extends AbstractStratification {
}
}
- output.addAll(queryStatus(thresholds.intervalMetricList, this));
+ output.addAll(queryStatus(thresholds.intervalMetricList));
return output;
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java
index d6acaf850..5902fce31 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java
@@ -46,22 +46,20 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
-import java.util.LinkedList;
import java.util.List;
final class LocusStratification extends AbstractStratification {
private long coverage;
private long rawCoverage;
- private final List locusStatisticsList;
public LocusStratification(ThresHolder thresholds) {
this(0,0,thresholds);
}
protected LocusStratification(int coverage, int rawCoverage, ThresHolder thresholds) {
+ super(thresholds);
this.coverage = coverage;
this.rawCoverage = rawCoverage;
- this.locusStatisticsList = thresholds.locusMetricList;
}
@Override
@@ -79,14 +77,7 @@ final class LocusStratification extends AbstractStratification {
* @return a set of all statuses that apply
*/
public List callableStatuses() {
- List output = new LinkedList();
- for (Metric stats : locusStatisticsList) {
- CallableStatus status = stats.status(this);
- if (status != null) {
- output.add(status);
- }
- }
- return output;
+ return queryStatus(thresholds.locusMetricList);
}
@Override
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java
index 1085e8cac..7984ba7e7 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java
@@ -58,6 +58,6 @@ final class PluginUtils {
final Map totals = sampleStratification.getStatusTally();
final int size = sampleStratification.getIntervalSize();
final int statusCount = totals.containsKey(CALL) ? totals.get(CALL) : 0;
- return ( (double) statusCount / size) >= threshold ? CALL: null;
+ return ( (double) statusCount / size) > threshold ? CALL: null;
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java
index b9ae1f3cf..0f84c7d22 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java
@@ -61,15 +61,14 @@ import java.util.List;
final class SampleStratification extends AbstractStratification {
private final GenomeLoc interval;
private final ArrayList loci;
- private final ThresHolder thresholds;
private int nReads = -1;
private int nBadMates = -1;
public SampleStratification(final GenomeLoc interval, final ThresHolder thresholds) {
+ super(thresholds);
this.interval = interval;
this.loci = new ArrayList(interval.size());
- this.thresholds = thresholds;
nReads = 0;
nBadMates = 0;
@@ -118,10 +117,10 @@ final class SampleStratification extends AbstractStratification {
* {@inheritDoc}
*/
@Override
- public Iterable callableStatuses() {
+ public List callableStatuses() {
final List output = new LinkedList();
- // get the tally of all the locus callable statuses
+ // collect the per-sample status reported by each locus metric
for (Metric locusStat : thresholds.locusMetricList) {
final CallableStatus status = ((LocusMetric) locusStat).sampleStatus(this);
if (status != null) {
@@ -130,12 +129,7 @@ final class SampleStratification extends AbstractStratification {
}
// get the sample specific statitics statuses
- for (Metric sampleStat : thresholds.sampleMetricList) {
- final CallableStatus status = sampleStat.status(this);
- if (status != null) {
- output.add(status);
- }
- }
+ output.addAll(queryStatus(thresholds.sampleMetricList));
// special case, if there are no reads, then there is no sense reporting coverage gaps.
if (output.contains(CallableStatus.NO_READS) && output.contains(CallableStatus.COVERAGE_GAPS))
@@ -159,4 +153,17 @@ final class SampleStratification extends AbstractStratification {
read.setTemporaryAttribute("seen", true);
}
}
+
+ public int getNLowCoveredLoci() {
+ return getCallableStatusCount(CallableStatus.LOW_COVERAGE);
+ }
+
+ public int getNUncoveredLoci() {
+ return getCallableStatusCount(CallableStatus.COVERAGE_GAPS);
+ }
+
+ private int getCallableStatusCount(CallableStatus status) {
+ final Integer x = getStatusTally().get(status);
+ return x == null ? 0 : x;
+ }
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java
index b0c999460..a6cbc1da3 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java
@@ -47,7 +47,9 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Output;
+import java.io.PrintStream;
import java.util.LinkedList;
import java.util.List;
@@ -114,6 +116,9 @@ final class ThresHolder {
@Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false)
public double qualityStatusThreshold = 0.50;
+ @Output(fullName = "missing_intervals", shortName = "missing", defaultToStdout = false, doc ="Produces a file with the intervals that don't pass filters", required = false)
+ public PrintStream missingTargets = null;
+
public final List locusMetricList = new LinkedList();
public final List sampleMetricList = new LinkedList();
public final List intervalMetricList = new LinkedList();
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java
new file mode 100644
index 000000000..5e3da5f4f
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java
@@ -0,0 +1,110 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.diagnostics.missing;
+
+/**
+ * Accumulator of per-interval quality metrics (GC content, base quality, mapping quality).
+ *
+ *
+ * Sums GC content over reference bases and base/mapping qualities over reads, exposing their averages; instances can be merged via combine() for reduce-style traversal.
+ *
+ *
+ *
+ * Input
+ *
+ * [Description of the Input]
+ *
+ *
+ * Output
+ *
+ * [Description of the Output]
+ *
+ *
+ * Examples
+ *
+ * java
+ * -jar GenomeAnalysisTK.jar
+ * -T [walker name]
+ *
+ *
+ * @author Mauricio Carneiro
+ * @since 5/1/13
+ */
+final class Metrics {
+ private double gccontent;
+ private double baseQual;
+ private double mapQual;
+ private int reads;
+ private int refs;
+
+ void reads(int reads) {this.reads = reads;}
+ void refs(int refs) {this.refs = refs;}
+
+ void gccontent(double gccontent) {this.gccontent = gccontent;}
+ void baseQual(double baseQual) {this.baseQual = baseQual;}
+ void mapQual(double mapQual) {this.mapQual = mapQual;}
+
+ double gccontent() {return refs > 0 ? gccontent/refs : 0.0;}
+ double baseQual() {return reads > 0 ? baseQual/reads : 0.0;}
+ double mapQual() {return reads > 0 ? mapQual/reads : 0.0;}
+
+ /**
+ * Combines two metrics
+ *
+ * @param value the other metric to combine
+ * @return itself, for simple reduce
+ */
+ public Metrics combine(Metrics value) {
+ this.gccontent += value.gccontent;
+ this.baseQual += value.baseQual;
+ this.mapQual += value.mapQual;
+ this.reads += value.reads;
+ this.refs += value.refs;
+
+ return this;
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java
new file mode 100644
index 000000000..62716d6d2
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java
@@ -0,0 +1,226 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.diagnostics.missing;
+
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.CommandLineGATK;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.report.GATKReport;
+import org.broadinstitute.sting.gatk.walkers.By;
+import org.broadinstitute.sting.gatk.walkers.DataSource;
+import org.broadinstitute.sting.gatk.walkers.LocusWalker;
+import org.broadinstitute.sting.gatk.walkers.NanoSchedulable;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.GenomeLocSortedSet;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.sting.utils.help.HelpConstants;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.text.XReadLines;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintStream;
+import java.util.List;
+
+/**
+ * Walks along reference and calculates a few metrics for each interval.
+ *
+ * Metrics:
+ *
+ * - Average Base Quality
+ * - Average Mapping Quality
+ * - GC Content
+ * - Position in the target
+ * - Coding Sequence / Intron
+ * - Length of the uncovered area
+ *
+ *
+ * Input
+ *
+ * A reference file
+ *
+ *
+ * Output
+ *
+ * GC content calculations per interval.
+ *
+ *
+ * Example
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -T QualifyMissingIntervals \
+ * -R ref.fasta \
+ * -o output.grp \
+ * -L input.intervals \
+ * -cds cds.intervals \
+ * -targets targets.intervals
+ *
+ *
+ */
+@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
+@By(DataSource.REFERENCE)
+public final class QualifyMissingIntervals extends LocusWalker implements NanoSchedulable {
+ @Output
+ protected PrintStream out;
+
+ @Argument(shortName = "targets", required = true)
+ public File targetsFile;
+
+ @Argument(shortName = "cds", required = false)
+ public File cdsFile;
+
+ GATKReport simpleReport;
+ GenomeLocSortedSet target;
+ GenomeLocSortedSet cds;
+
+ public boolean isReduceByInterval() {
+ return true;
+ }
+
+ public void initialize() {
+ simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "IN", "GC", "BQ", "MQ", "TP", "CD", "LN");
+ final GenomeLocParser parser = getToolkit().getGenomeLocParser();
+ target = new GenomeLocSortedSet(parser);
+ cds = new GenomeLocSortedSet(parser);
+ parseFile(targetsFile, target, parser);
+ parseFile(cdsFile, cds, parser);
+ }
+
+ public Metrics reduceInit() {
+ return new Metrics();
+ }
+
+ public Metrics map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ if (tracker == null)
+ return null;
+
+ final Metrics metrics = new Metrics();
+ final byte baseIndex = ref.getBase();
+ final ReadBackedPileup pileup = context.getBasePileup();
+ final int nBases = pileup.getNumberOfElements();
+
+ double baseQual = 0.0;
+ for (byte qual : pileup.getQuals()) {
+ baseQual += qual;
+ }
+ double mapQual = 0.0;
+ for (byte qual : pileup.getMappingQuals()) {
+ mapQual += qual;
+ }
+
+ metrics.baseQual(baseQual);
+ metrics.mapQual(mapQual);
+ metrics.gccontent(baseIndex == 'C' || baseIndex == 'G' ? 1.0 : 0.0);
+ metrics.reads(nBases);
+ metrics.refs(1);
+
+ return metrics;
+ }
+
+ @Override
+ public Metrics reduce(Metrics value, Metrics sum) {
+ return sum.combine(value);
+ }
+
+    public void onTraversalDone(List<Pair<GenomeLoc, Metrics>> results) {
+        for (Pair<GenomeLoc, Metrics> r : results) {
+ GenomeLoc interval = r.getFirst();
+ Metrics metrics = r.getSecond();
+ simpleReport.addRow(
+ interval.toString(),
+ metrics.gccontent(),
+ metrics.baseQual(),
+ metrics.mapQual(),
+ getPositionInTarget(interval),
+ cds.overlaps(interval),
+ interval.size()
+ );
+ }
+ simpleReport.print(out);
+ out.close();
+ }
+
+ private static GenomeLoc parseInterval(String s, GenomeLocParser parser) {
+ if (s.isEmpty()) {
+ return null;
+ }
+ String[] first = s.split(":");
+ if (first.length == 2) {
+ String[] second = first[1].split("\\-");
+ return parser.createGenomeLoc(first[0], Integer.decode(second[0]), Integer.decode(second[1]));
+ } else {
+ throw new UserException.BadInput("Interval doesn't parse correctly: " + s);
+ }
+ }
+
+ private void parseFile(File file, GenomeLocSortedSet set, GenomeLocParser parser) {
+ try {
+ for (String s : new XReadLines(file) ) {
+ GenomeLoc interval = parseInterval(s, parser);
+ if (interval != null)
+ set.add(interval, true);
+ }
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+ }
+
+ private int getPositionInTarget(GenomeLoc interval) {
+        final List<GenomeLoc> hits = target.getOverlapping(interval);
+ int result = 0;
+ for (GenomeLoc hit : hits) {
+ result = interval.getStart() - hit.getStart(); // if there are multiple hits, we'll get the last one.
+ }
+ return result;
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
index c6e9ea379..0f3f7739d 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
@@ -76,7 +76,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
     private List<Allele> alleleList = new ArrayList<Allele>();
- protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
+ protected IndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC,
+ final Logger logger) {
super(UAC, logger);
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
@@ -85,10 +86,11 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
}
-    protected static List<Allele> computeConsensusAlleles(ReferenceContext ref,
-                                                          Map<String, AlignmentContext> contexts,
-                                                          AlignmentContextUtils.ReadOrientation contextType,
-                                                          GenomeLocParser locParser, UnifiedArgumentCollection UAC) {
+    protected static List<Allele> computeConsensusAlleles(final ReferenceContext ref,
+                                                          final Map<String, AlignmentContext> contexts,
+                                                          final AlignmentContextUtils.ReadOrientation contextType,
+                                                          final GenomeLocParser locParser,
+                                                          final UnifiedArgumentCollection UAC) {
ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
return counter.computeConsensusAlleles(ref, contexts, contextType);
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
index ce5f94478..360f88e51 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
@@ -147,9 +147,17 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
// if we only want variants, then we don't need to calculate genotype likelihoods
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
return builder.make();
+ // if user requires all PLs at all sites, add all possible alt alleles
+ else if (UAC.annotateAllSitesWithPLs) {
+ for ( final byte base : BaseUtils.BASES ) {
+ if ( base != refBase )
+ alleles.add(Allele.create(base));
+ }
+ }
- // otherwise, choose any alternate allele (it doesn't really matter)
- alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
+ else
+ // otherwise, choose any alternate allele (it doesn't really matter)
+ alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
index e346b10b7..f156468cc 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@@ -52,6 +52,9 @@ import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.VariantContext;
+import java.util.Collections;
+import java.util.List;
+
public class UnifiedArgumentCollection extends StandardCallerArgumentCollection {
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false)
@@ -82,7 +85,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
* The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
*/
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false)
- public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ORIGINAL;
+ public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
/**
* The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base
@@ -95,6 +98,18 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
+ /**
+ * Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites.
+ * This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any).
+ * WARNINGS:
+ * - This feature will inflate VCF file size considerably.
+ * - All SNP ALT alleles will be emitted with corresponding 10 PL values.
+ * - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used
+ */
+ @Advanced
+ @Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false)
+ public boolean annotateAllSitesWithPLs = false;
+
// indel-related arguments
/**
* A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site.
@@ -247,7 +262,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES;
this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO;
this.pairHMM = uac.pairHMM;
-
+ this.annotateAllSitesWithPLs = uac.annotateAllSitesWithPLs;
// todo- arguments to remove
this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES;
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
index 3380efcc9..9f3368cf8 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@@ -83,6 +83,9 @@ public class UnifiedGenotyperEngine {
public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3;
public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4;
+ private static final int SNP_MODEL = 0;
+ private static final int INDEL_MODEL = 1;
+
public enum OUTPUT_MODE {
/** produces calls only at variant sites */
EMIT_VARIANTS_ONLY,
@@ -165,6 +168,13 @@ public class UnifiedGenotyperEngine {
filter.add(LOW_QUAL_FILTER_NAME);
determineGLModelsToUse();
+
+ // do argument checking
+ if (UAC.annotateAllSitesWithPLs) {
+ if (!modelsToUse.contains(GenotypeLikelihoodsCalculationModel.Model.SNP))
+ throw new IllegalArgumentException("Invalid genotype likelihood model specification: Only diploid SNP model can be used in conjunction with option allSitePLs");
+
+ }
}
/**
@@ -436,7 +446,8 @@ public class UnifiedGenotyperEngine {
bestGuessIsRef = false;
}
// if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
- else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
+ else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ||
+ UAC.annotateAllSitesWithPLs) {
myAlleles.add(alternateAllele);
alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
}
@@ -446,7 +457,7 @@ public class UnifiedGenotyperEngine {
// note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
final double phredScaledConfidence =
- Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
+ Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || UAC.annotateAllSitesWithPLs
? -10 * AFresult.getLog10PosteriorOfAFEq0()
: -10 * AFresult.getLog10PosteriorOfAFGT0());
@@ -540,11 +551,6 @@ public class UnifiedGenotyperEngine {
builder.attributes(attributes);
VariantContext vcCall = builder.make();
- // if we are subsetting alleles (either because there were too many or because some were not polymorphic)
- // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
- if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync
- vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall);
-
if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
final ReadBackedPileup pileup = rawContext.getBasePileup();
@@ -553,6 +559,11 @@ public class UnifiedGenotyperEngine {
vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
}
+ // if we are subsetting alleles (either because there were too many or because some were not polymorphic)
+ // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
+ if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync
+ vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall);
+
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
}
@@ -693,13 +704,13 @@ public class UnifiedGenotyperEngine {
}
private void determineGLModelsToUse() {
-
String modelPrefix = "";
if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY )
modelPrefix = GPSTRING;
- if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) {
- modelPrefix += UAC.GLmodel.name().toUpperCase().replaceAll("BOTH","");
+ // GGA mode => must initialize both the SNP and indel models
+ if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ||
+ UAC.GLmodel.name().toUpperCase().contains("BOTH") ) {
modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP"));
modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL"));
}
@@ -712,31 +723,24 @@ public class UnifiedGenotyperEngine {
     private List<GenotypeLikelihoodsCalculationModel.Model> getGLModelsToUse(final RefMetaDataTracker tracker,
final ReferenceContext refContext,
final AlignmentContext rawContext) {
-
if ( UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES )
return modelsToUse;
+ if ( modelsToUse.size() != 2 )
+ throw new IllegalStateException("GGA mode assumes that we have initialized both the SNP and indel models but found " + modelsToUse);
+
// if we're genotyping given alleles then we need to choose the model corresponding to the variant type requested
-        final List<GenotypeLikelihoodsCalculationModel.Model> GGAmodel = new ArrayList<GenotypeLikelihoodsCalculationModel.Model>(1);
final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
- if ( vcInput == null )
- return GGAmodel; // no work to be done
- if ( vcInput.isSNP() ) {
- // use the SNP model unless the user chose INDEL mode only
- if ( modelsToUse.size() == 2 || modelsToUse.get(0).name().endsWith("SNP") )
- GGAmodel.add(modelsToUse.get(0));
+ if ( vcInput == null ) {
+ return Collections.emptyList(); // no work to be done
+ } else if ( vcInput.isSNP() ) {
+ return Collections.singletonList(modelsToUse.get(SNP_MODEL));
+ } else if ( vcInput.isIndel() || vcInput.isMixed() ) {
+ return Collections.singletonList(modelsToUse.get(INDEL_MODEL));
+ } else {
+ return Collections.emptyList(); // No support for other types yet
}
- else if ( vcInput.isIndel() || vcInput.isMixed() ) {
- // use the INDEL model unless the user chose SNP mode only
- if ( modelsToUse.size() == 2 )
- GGAmodel.add(modelsToUse.get(1));
- else if ( modelsToUse.get(0).name().endsWith("INDEL") )
- GGAmodel.add(modelsToUse.get(0));
- }
- // No support for other types yet
-
- return GGAmodel;
}
/**
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java
index 170b6e250..2ece18002 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java
@@ -106,7 +106,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles()));
builder.alleles(alleles);
- builder.genotypes(GATKVariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
+ builder.genotypes(GATKVariantContextUtils.subsetDiploidAlleles(vc, alleles, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL));
return builder.make();
} else {
return vc;
@@ -352,6 +352,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
                                                      final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy) {
- return GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes);
+ return allelesToUse.size() == 1
+ ? GATKVariantContextUtils.subsetToRefOnly(vc, ploidy)
+ : GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse,
+ assignGenotypes ? GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN : GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL);
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java
new file mode 100644
index 000000000..063e3b218
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java
@@ -0,0 +1,142 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.TreeSet;
+
+/**
+ * Trim down an active region based on a set of variants found across the haplotypes within the region
+ *
+ * User: depristo
+ * Date: 4/27/13
+ * Time: 2:10 PM
+ */
+class ActiveRegionTrimmer {
+ private final static Logger logger = Logger.getLogger(ActiveRegionTrimmer.class);
+ private final boolean logTrimming;
+ private final int snpPadding, nonSnpPadding, maxDistanceInExtensionForGenotyping;
+ private final GenomeLocParser parser;
+
+ /**
+ * Create a new ActiveRegionTrimmer
+ *
+ * @param logTrimming should we log our trimming events?
+ * @param snpPadding how much bp context should we ensure around snps?
+ * @param nonSnpPadding how much bp context should we ensure around anything not a snp?
+     * @param maxDistanceInExtensionForGenotyping the max extent we are willing to go into the extended region of the
+     *                                            original active region in order to properly genotype events in the
+     *                                            non-extended active region?
+ * @param parser a genome loc parser so we can create genome locs
+ */
+ ActiveRegionTrimmer(boolean logTrimming, int snpPadding, int nonSnpPadding, int maxDistanceInExtensionForGenotyping, GenomeLocParser parser) {
+ if ( snpPadding < 0 ) throw new IllegalArgumentException("snpPadding must be >= 0 but got " + snpPadding);
+ if ( nonSnpPadding < 0 ) throw new IllegalArgumentException("nonSnpPadding must be >= 0 but got " + nonSnpPadding);
+ if ( maxDistanceInExtensionForGenotyping < 0 ) throw new IllegalArgumentException("maxDistanceInExtensionForGenotyping must be >= 0 but got " + maxDistanceInExtensionForGenotyping);
+ if ( parser == null ) throw new IllegalArgumentException("parser cannot be null");
+
+ this.logTrimming = logTrimming;
+ this.snpPadding = snpPadding;
+ this.nonSnpPadding = nonSnpPadding;
+ this.maxDistanceInExtensionForGenotyping = maxDistanceInExtensionForGenotyping;
+ this.parser = parser;
+ }
+
+ /**
+ * Trim down the active region to a region large enough to properly genotype the events found within the active
+ * region span, excluding all variants that only occur within its extended span.
+ *
+ * This function merely creates the region, but it doesn't populate the reads back into the region.
+ *
+ * @param region our full active region
+ * @param allVariantsWithinExtendedRegion all of the variants found in the entire region, sorted by their start position
+ * @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully
+ */
+ public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion) {
+        if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants within the region, so there's nothing to trim to; signal with null
+ return null;
+
+ final List withinActiveRegion = new LinkedList();
+ int pad = snpPadding;
+ GenomeLoc trimLoc = null;
+ for ( final VariantContext vc : allVariantsWithinExtendedRegion ) {
+ final GenomeLoc vcLoc = parser.createGenomeLoc(vc);
+ if ( region.getLocation().overlapsP(vcLoc) ) {
+ if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding
+ pad = nonSnpPadding;
+ trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc);
+ withinActiveRegion.add(vc);
+ }
+ }
+
+ // we don't actually have anything in the region after removing variants that don't overlap the region's full location
+ if ( trimLoc == null ) return null;
+
+ final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping);
+ final GenomeLoc idealSpan = parser.createPaddedGenomeLoc(trimLoc, pad);
+ final GenomeLoc finalSpan = maxSpan.intersect(idealSpan);
+
+ final ActiveRegion trimmedRegion = region.trim(finalSpan);
+ if ( logTrimming ) {
+ logger.info("events : " + withinActiveRegion);
+ logger.info("trimLoc : " + trimLoc);
+ logger.info("pad : " + pad);
+ logger.info("idealSpan : " + idealSpan);
+ logger.info("maxSpan : " + maxSpan);
+ logger.info("finalSpan : " + finalSpan);
+ logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size());
+ }
+ return trimmedRegion;
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
index 12a4841bf..d876a403b 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
@@ -46,102 +46,55 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
-import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
-import net.sf.samtools.Cigar;
-import net.sf.samtools.CigarElement;
-import net.sf.samtools.CigarOperator;
-import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
-import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
-import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
import org.broadinstitute.sting.utils.MathUtils;
-import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
-import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
-import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
-import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet;
-import org.broadinstitute.variant.variantcontext.Allele;
-import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
-import java.util.*;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
/**
- * Created by IntelliJ IDEA.
+ * DeBruijn assembler for the HaplotypeCaller
+ *
* User: ebanks, rpoplin
* Date: Mar 14, 2011
*/
-
public class DeBruijnAssembler extends LocalAssemblyEngine {
private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class);
- private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
-
// TODO -- this number is very low, and limits our ability to explore low-frequency variants. It should
// TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where
// TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases
- private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25;
+ private final static int NUM_PATHS_PER_GRAPH = 25;
+ private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
private static final int GRAPH_KMER_STEP = 6;
+ private static final int GGA_MODE_ARTIFICIAL_COUNTS = 1000;
- private final boolean debug;
- private final boolean debugGraphTransformations;
private final int minKmer;
- private final boolean allowCyclesInKmerGraphToGeneratePaths;
-
private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
-
protected DeBruijnAssembler() {
- this(false, -1, 11, false);
+ this(25, -1);
}
- public DeBruijnAssembler(final boolean debug,
- final int debugGraphTransformations,
- final int minKmer,
- final boolean allowCyclesInKmerGraphToGeneratePaths) {
- super();
- this.debug = debug;
- this.debugGraphTransformations = debugGraphTransformations > 0;
- this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations;
+ public DeBruijnAssembler(final int minKmer, final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms) {
+ super(NUM_PATHS_PER_GRAPH);
this.minKmer = minKmer;
- this.allowCyclesInKmerGraphToGeneratePaths = allowCyclesInKmerGraphToGeneratePaths;
+ this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
}
- /**
- * Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
- * @param activeRegion ActiveRegion object holding the reads which are to be used during assembly
- * @param refHaplotype reference haplotype object
- * @param fullReferenceWithPadding byte array holding the reference sequence with padding
- * @param refLoc GenomeLoc object corresponding to the reference sequence with padding
- * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode
- * @return a non-empty list of all the haplotypes that are produced during assembly
- */
- @Ensures({"result.contains(refHaplotype)"})
- public List runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype ) {
- if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); }
- if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); }
- if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); }
- if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
-
- // create the graphs
- final List graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
-
- // print the graphs if the appropriate debug option has been turned on
- if( graphWriter != null ) {
- printGraphs(graphs);
- }
-
- // find the best paths in the graphs and return them as haplotypes
- return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
- }
-
- @Requires({"reads != null", "refHaplotype != null"})
- protected List createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) {
- final List graphs = new LinkedList();
+ @Override
+ protected List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes ) {
+ final List graphs = new LinkedList<>();
final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
if( maxKmer < minKmer) {
@@ -154,7 +107,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
continue;
if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
- DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype);
+ DeBruijnGraph graph = createGraphFromSequences(reads, kmer, refHaplotype, activeAlleleHaplotypes);
if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
// do a series of steps to clean up the raw assembly graph to make it analysis-ready
if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor);
@@ -165,10 +118,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
" future subsystem will actually go and error correct the reads");
}
- final SeqGraph seqGraph = toSeqGraph(graph);
+ final SeqGraph seqGraph = cleanupSeqGraph(graph.convertToSequenceGraph());
if ( seqGraph != null ) { // if the graph contains interesting variation from the reference
- sanityCheckReferenceGraph(seqGraph, refHaplotype);
graphs.add(seqGraph);
if ( debugGraphTransformations ) // we only want to use one graph size
@@ -181,71 +133,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
return graphs;
}
- private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) {
- final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph();
- if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor);
-
- // TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm
- // TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect
- // TODO -- to anything from one that's actually has good support along the chain but just happens
- // TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately
- // TODO -- the pruning algorithm really should be an error correction algorithm that knows more
- // TODO -- about the structure of the data and can differentiate between an infrequent path but
- // TODO -- without evidence against it (such as occurs when a region is hard to get any reads through)
- // TODO -- from a error with lots of weight going along another similar path
- // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
- seqGraph.zipLinearChains();
- if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor);
-
- // now go through and prune the graph, removing vertices no longer connected to the reference chain
- // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight
- // edges to maintain graph connectivity.
- seqGraph.pruneGraph(pruneFactor);
- seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
-
- if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor);
- seqGraph.simplifyGraph();
- if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor);
-
- // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can
- // happen in cases where for example the reference somehow manages to acquire a cycle, or
- // where the entire assembly collapses back into the reference sequence.
- if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null )
- return null;
-
- seqGraph.removePathsNotConnectedToRef();
- seqGraph.simplifyGraph();
- if ( seqGraph.vertexSet().size() == 1 ) {
- // we've perfectly assembled into a single reference haplotype, add a empty seq vertex to stop
- // the code from blowing up.
- // TODO -- ref properties should really be on the vertices, not the graph itself
- final SeqVertex complete = seqGraph.vertexSet().iterator().next();
- final SeqVertex dummy = new SeqVertex("");
- seqGraph.addVertex(dummy);
- seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0));
- }
- if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor);
-
- return seqGraph;
- }
-
- protected void sanityCheckReferenceGraph(final BaseGraph graph, final Haplotype refHaplotype) {
- if( graph.getReferenceSourceVertex() == null ) {
- throw new IllegalStateException("All reference graphs must have a reference source vertex.");
- }
- if( graph.getReferenceSinkVertex() == null ) {
- throw new IllegalStateException("All reference graphs must have a reference sink vertex.");
- }
- if( !Arrays.equals(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true), refHaplotype.getBases()) ) {
- throw new IllegalStateException("Mismatch between the reference haplotype and the reference assembly graph path." +
- " graph = " + new String(graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true)) +
- " haplotype = " + new String(refHaplotype.getBases())
- );
- }
- }
-
@Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"})
- protected DeBruijnGraph createGraphFromSequences( final List reads, final int kmerLength, final Haplotype refHaplotype ) {
+ protected DeBruijnGraph createGraphFromSequences( final List reads, final int kmerLength, final Haplotype refHaplotype, final List activeAlleleHaplotypes ) {
final DeBruijnGraph graph = new DeBruijnGraph(kmerLength);
final DeBruijnGraphBuilder builder = new DeBruijnGraphBuilder(graph);
@@ -254,6 +143,11 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
// something went wrong, so abort right now with a null graph
return null;
+ // add the artificial GGA haplotypes to the graph
+ if ( ! addGGAKmersToGraph(builder, activeAlleleHaplotypes) )
+ // something went wrong, so abort right now with a null graph
+ return null;
+
// now go through the graph already seeded with the reference sequence and add the read kmers to it
if ( ! addReadKmersToGraph(builder, reads) )
// some problem was detected adding the reads to the graph, return null to indicate we failed
@@ -263,6 +157,28 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
return graph;
}
+ /**
+ * Add the high-quality kmers from the artificial GGA haplotypes to the graph
+ *
+     * @param builder a debruijn graph builder to add the haplotype kmers to
+ * @param activeAlleleHaplotypes a list of haplotypes to add to the graph for GGA mode
+     * @return true if we successfully added the haplotype kmers to the graph without corrupting it in some way
+ */
+ protected boolean addGGAKmersToGraph(final DeBruijnGraphBuilder builder, final List activeAlleleHaplotypes) {
+
+ final int kmerLength = builder.getKmerSize();
+
+ for( final Haplotype haplotype : activeAlleleHaplotypes ) {
+ final int end = haplotype.length() - kmerLength;
+ for( int start = 0; start < end; start++ ) {
+ builder.addKmerPairFromSeqToGraph( haplotype.getBases(), start, GGA_MODE_ARTIFICIAL_COUNTS );
+ }
+ }
+
+ // always returns true now, but it's possible that we'd add kmers and decide we don't like the graph in some way
+ return true;
+ }
+
/**
* Add the high-quality kmers from the reads to the graph
*
@@ -344,290 +260,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
return true;
}
- protected void printGraphs(final List graphs) {
- final int writeFirstGraphWithSizeSmallerThan = 50;
-
- graphWriter.println("digraph assemblyGraphs {");
- for( final SeqGraph graph : graphs ) {
- if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) {
- logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize());
- continue;
- }
-
- graph.printGraph(graphWriter, false, pruneFactor);
-
- if ( debugGraphTransformations )
- break;
- }
-
- graphWriter.println("}");
- }
-
- @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
- @Ensures({"result.contains(refHaplotype)"})
- private List findBestPaths( final List graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {
-
- // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
- // TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm
- final List returnHaplotypes = new ArrayList();
- refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart());
- final Cigar c = new Cigar();
- c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M));
- refHaplotype.setCigar(c);
- returnHaplotypes.add( refHaplotype );
-
- final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
- final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
-
- // for GGA mode, add the desired allele into the haplotype
- for( final VariantContext compVC : activeAllelesToGenotype ) {
- for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
- final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart());
- addHaplotypeForGGA( insertedRefHaplotype, refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true );
- }
- }
-
- for( final SeqGraph graph : graphs ) {
- final SeqVertex source = graph.getReferenceSourceVertex();
- final SeqVertex sink = graph.getReferenceSinkVertex();
- if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph);
-
- final KBestPaths pathFinder = new KBestPaths(allowCyclesInKmerGraphToGeneratePaths);
- for ( final Path path : pathFinder.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH, source, sink) ) {
-// logger.info("Found path " + path);
- Haplotype h = new Haplotype( path.getBases() );
- if( !returnHaplotypes.contains(h) ) {
- final Cigar cigar = path.calculateCigar();
- if( cigar.isEmpty() ) {
- throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength());
- } else if ( pathIsTooDivergentFromReference(cigar) || cigar.getReferenceLength() < 60 ) { // N cigar elements means that a bubble was too divergent from the reference so skip over this path
- continue;
- } else if( cigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // SW failure
- throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength());
- }
- h.setCigar(cigar);
-
- // extend partial haplotypes which are anchored in the reference to include the full active region
- h = extendPartialHaplotype(h, activeRegionStart, refWithPadding);
- final Cigar leftAlignedCigar = leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(h.getCigar()), refWithPadding, h.getBases(), activeRegionStart, 0);
- if( leftAlignedCigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // left alignment failure
- continue;
- }
- if( !returnHaplotypes.contains(h) ) {
- h.setAlignmentStartHapwrtRef(activeRegionStart);
- h.setCigar(leftAlignedCigar);
- h.setScore(path.getScore());
- returnHaplotypes.add(h);
-
- if ( debug )
- logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());
-
- // for GGA mode, add the desired allele into the haplotype if it isn't already present
- if( !activeAllelesToGenotype.isEmpty() ) {
- final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
- for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
- final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
-
- // This if statement used to additionally have:
- // "|| !vcOnHaplotype.hasSameAllelesAs(compVC)"
- // but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto
- // a haplotype that already contains a 1bp insertion (so practically it is reference but
- // falls into the bin for the 1bp deletion because we keep track of the artificial alleles).
- if( vcOnHaplotype == null ) {
- for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
- addHaplotypeForGGA( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), refWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false );
- }
- }
- }
- }
- }
- }
- }
- }
-
- // add genome locs to the haplotypes
- for ( final Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow);
-
- if ( returnHaplotypes.size() < returnHaplotypes.size() )
- logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc);
-
- if( debug ) {
- if( returnHaplotypes.size() > 1 ) {
- logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against.");
- } else {
- logger.info("Found only the reference haplotype in the assembly graph.");
- }
- for( final Haplotype h : returnHaplotypes ) {
- logger.info( h.toString() );
- logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() );
- }
- }
-
- return returnHaplotypes;
- }
-
- /**
- * Extend partial haplotypes which are anchored in the reference to include the full active region
- * @param haplotype the haplotype to extend
- * @param activeRegionStart the place where the active region starts in the ref byte array
- * @param refWithPadding the full reference byte array with padding which encompasses the active region
- * @return a haplotype fully extended to encompass the active region
- */
- @Requires({"haplotype != null", "activeRegionStart >= 0", "refWithPadding != null", "refWithPadding.length > 0"})
- @Ensures({"result != null", "result.getCigar() != null"})
- private Haplotype extendPartialHaplotype( final Haplotype haplotype, final int activeRegionStart, final byte[] refWithPadding ) {
- final Cigar cigar = haplotype.getCigar();
- final Cigar newCigar = new Cigar();
- byte[] newHaplotypeBases = haplotype.getBases();
- int refPos = activeRegionStart;
- int hapPos = 0;
- for( int iii = 0; iii < cigar.getCigarElements().size(); iii++ ) {
- final CigarElement ce = cigar.getCigarElement(iii);
- switch (ce.getOperator()) {
- case M:
- refPos += ce.getLength();
- hapPos += ce.getLength();
- newCigar.add(ce);
- break;
- case I:
- hapPos += ce.getLength();
- newCigar.add(ce);
- break;
- case D:
- if( iii == 0 || iii == cigar.getCigarElements().size() - 1 ) {
- newHaplotypeBases = ArrayUtils.addAll( Arrays.copyOfRange(newHaplotypeBases, 0, hapPos),
- ArrayUtils.addAll(Arrays.copyOfRange(refWithPadding, refPos, refPos + ce.getLength()),
- Arrays.copyOfRange(newHaplotypeBases, hapPos, newHaplotypeBases.length)));
- hapPos += ce.getLength();
- refPos += ce.getLength();
- newCigar.add(new CigarElement(ce.getLength(), CigarOperator.M));
- } else {
- refPos += ce.getLength();
- newCigar.add(ce);
- }
- break;
- default:
- throw new IllegalStateException("Unsupported cigar operator detected: " + ce.getOperator());
- }
- }
- final Haplotype returnHaplotype = new Haplotype(newHaplotypeBases, haplotype.isReference());
- returnHaplotype.setCigar( newCigar );
- return returnHaplotype;
- }
-
- /**
- * We use CigarOperator.N as the signal that an incomplete or too divergent bubble was found during bubble traversal
- * @param c the cigar to test
- * @return true if we should skip over this path
- */
- @Requires("c != null")
- private boolean pathIsTooDivergentFromReference( final Cigar c ) {
- for( final CigarElement ce : c.getCigarElements() ) {
- if( ce.getOperator().equals(CigarOperator.N) ) {
- return true;
- }
- }
- return false;
- }
-
- /**
- * Left align the given cigar sequentially. This is needed because AlignmentUtils doesn't accept cigars with more than one indel in them.
- * This is a target of future work to incorporate and generalize into AlignmentUtils for use by others.
- * @param cigar the cigar to left align
- * @param refSeq the reference byte array
- * @param readSeq the read byte array
- * @param refIndex 0-based alignment start position on ref
- * @param readIndex 0-based alignment start position on read
- * @return the left-aligned cigar
- */
- @Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"})
- protected Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
- final Cigar cigarToReturn = new Cigar();
- Cigar cigarToAlign = new Cigar();
- for (int i = 0; i < cigar.numCigarElements(); i++) {
- final CigarElement ce = cigar.getCigarElement(i);
- if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) {
- cigarToAlign.add(ce);
- for( final CigarElement toAdd : AlignmentUtils.leftAlignIndel(cigarToAlign, refSeq, readSeq, refIndex, readIndex, false).getCigarElements() ) {
- cigarToReturn.add(toAdd);
- }
- refIndex += cigarToAlign.getReferenceLength();
- readIndex += cigarToAlign.getReadLength();
- cigarToAlign = new Cigar();
- } else {
- cigarToAlign.add(ce);
- }
- }
- if( !cigarToAlign.isEmpty() ) {
- for( final CigarElement toAdd : cigarToAlign.getCigarElements() ) {
- cigarToReturn.add(toAdd);
- }
- }
- return cigarToReturn;
- }
-
- /**
- * Take a haplotype which was generated by injecting an allele into a string of bases and run SW against the reference to determine the variants on the haplotype.
- * Unfortunately since this haplotype didn't come from the assembly graph you can't straightforwardly use the bubble traversal algorithm to get this information.
- * This is a target for future work as we rewrite the HaplotypeCaller to be more bubble-caller based.
- * @param haplotype the candidate haplotype
- * @param ref the reference bases to align against
- * @param haplotypeList the current list of haplotypes
- * @param activeRegionStart the start of the active region in the reference byte array
- * @param activeRegionStop the stop of the active region in the reference byte array
- * @param FORCE_INCLUSION_FOR_GGA_MODE if true will include in the list even if it already exists
- * @return true if the candidate haplotype was successfully incorporated into the haplotype list
- */
- @Requires({"ref != null", "ref.length >= activeRegionStop - activeRegionStart"})
- private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final List haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) {
- if( haplotype == null ) { return false; }
-
- final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS );
- haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
-
- if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
- return false;
- }
-
- haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0, true) );
-
- final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate(haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true);
- int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true );
- if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) {
- hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal
- }
- byte[] newHaplotypeBases;
- // extend partial haplotypes to contain the full active region sequence
- if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
- newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()),
- haplotype.getBases()),
- ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
- } else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
- newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) );
- } else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
- newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
- } else {
- newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop);
- }
-
- final Haplotype h = new Haplotype( newHaplotypeBases );
- final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SWParameterSet.STANDARD_NGS );
-
- h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
- if ( haplotype.isArtificialHaplotype() ) {
- h.setArtificialEvent(haplotype.getArtificialEvent());
- }
- if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart || swConsensus2.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
- return false;
- }
-
- h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0, true) );
-
- if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) {
- haplotypeList.add(h);
- return true;
- } else {
- return false;
- }
+ @Override
+ public String toString() {
+ return "DeBruijnAssembler{" +
+ "minKmer=" + minKmer +
+ '}';
}
}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
index 419ea378f..04173b64f 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
@@ -49,6 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
@@ -71,7 +72,7 @@ public class GenotypingEngine {
private final boolean DEBUG;
private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS;
- private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied
+ private final static List noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied
private final VariantAnnotatorEngine annotationEngine;
private final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
@@ -146,6 +147,7 @@ public class GenotypingEngine {
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
+ final RefMetaDataTracker tracker,
final List activeAllelesToGenotype ) {
// sanity check input arguments
if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine);
@@ -162,8 +164,8 @@ public class GenotypingEngine {
final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype);
// Walk along each position in the key set and create each event to be outputted
- final Set calledHaplotypes = new HashSet();
- final List returnCalls = new ArrayList();
+ final Set calledHaplotypes = new HashSet<>();
+ final List returnCalls = new ArrayList<>();
for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
@@ -183,7 +185,7 @@ public class GenotypingEngine {
if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) {
throw new ReviewedStingException("Record size mismatch! Something went wrong in the merging of alleles.");
}
- final Map mergeMap = new LinkedHashMap();
+ final Map mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
for(int iii = 0; iii < mergedVC.getAlternateAlleles().size(); iii++) {
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
@@ -204,13 +206,12 @@ public class GenotypingEngine {
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) );
final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call );
- VariantContext annotatedCall = call;
- if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
+ VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call);
+
+ if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
}
- annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, annotatedCall);
-
// maintain the set of all called haplotypes
for ( final Allele calledAllele : call.getAlleles() )
calledHaplotypes.addAll(alleleMapper.get(calledAllele));
@@ -244,7 +245,7 @@ public class GenotypingEngine {
if ( in_GGA_mode ) startPosKeySet.clear();
- cleanUpSymbolicUnassembledEvents( haplotypes );
+ //cleanUpSymbolicUnassembledEvents( haplotypes ); // We don't make symbolic alleles so this isn't needed currently
if ( !in_GGA_mode ) {
// run the event merger if we're not in GGA mode
final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc);
@@ -267,7 +268,7 @@ public class GenotypingEngine {
* @return the list of the sources of vcs in the same order
*/
private List makePriorityList(final List vcs) {
- final List priorityList = new LinkedList();
+ final List priorityList = new LinkedList<>();
for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource());
return priorityList;
}
@@ -276,7 +277,7 @@ public class GenotypingEngine {
final int loc,
final List activeAllelesToGenotype) {
// the overlapping events to merge into a common reference view
- final List eventsAtThisLoc = new ArrayList();
+ final List eventsAtThisLoc = new ArrayList<>();
if( activeAllelesToGenotype.isEmpty() ) {
for( final Haplotype h : haplotypes ) {
@@ -292,7 +293,7 @@ public class GenotypingEngine {
if( compVC.getStart() == loc ) {
int alleleCount = 0;
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
- List alleleSet = new ArrayList(2);
+ List alleleSet = new ArrayList<>(2);
alleleSet.add(compVC.getReference());
alleleSet.add(compAltAllele);
final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount;
@@ -348,7 +349,7 @@ public class GenotypingEngine {
final Map> perSampleFilteredReadList,
final VariantContext call ) {
- final Map returnMap = new LinkedHashMap();
+ final Map returnMap = new LinkedHashMap<>();
final GenomeLoc callLoc = parser.createGenomeLoc(call);
for( final Map.Entry sample : perSampleReadMap.entrySet() ) {
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
@@ -384,7 +385,7 @@ public class GenotypingEngine {
// TODO - split into input haplotypes and output haplotypes as not to share I/O arguments
@Requires("haplotypes != null")
protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) {
- final List haplotypesToRemove = new ArrayList();
+ final List haplotypesToRemove = new ArrayList<>();
for( final Haplotype h : haplotypes ) {
for( final VariantContext vc : h.getEventMap().getVariantContexts() ) {
if( vc.isSymbolic() ) {
@@ -407,7 +408,7 @@ public class GenotypingEngine {
final Map> alleleMapper,
final double downsamplingFraction ) {
- final Map alleleReadMap = new LinkedHashMap();
+ final Map